In [1]:
from py2neo import Graph,Node,Relationship,NodeMatcher
import pandas as pd
import numpy as np
from multiprocessing.pool import ThreadPool
import akshare as ak
import logging
from datetime import datetime, timedelta
from tqdm import tqdm
import os
from finllmqa.api.core import NEO4J_API_URL, STOCK_KG_USER, STOCK_KG_PW

pool = ThreadPool(10)
# 连接Neo4j数据库
graph = Graph(NEO4J_API_URL, auth=(STOCK_KG_USER, STOCK_KG_PW), name='neo4j')
matcher = NodeMatcher(graph)



In [2]:
def get_logger(name):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    
    # 以下两行是为了在jupyter notebook 中不重复输出日志
    if logger.root.handlers:
        logger.root.handlers[0].setLevel(logging.WARNING)
 
    handler_stdout = logging.StreamHandler()
    handler_stdout.setLevel(logging.INFO)
    handler_stdout.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler_stdout)
 
    handler_file = logging.FileHandler('log_file.log', encoding='utf-8')
    handler_file.setLevel(logging.DEBUG)
    handler_file.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(handler_file)
 
    return logger
log = get_logger(None)

# Construct financial knowledge graph of stock

In [13]:
# get stock data
df_stock_code = ak.stock_info_a_code_name()
stock_code_list = df_stock_code['code'].to_list()[:40]

In [6]:
# create stock node constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:股票) REQUIRE n.代码 IS UNIQUE'
graph.run(cypher=cypher)

# create stock node
for code, name in df_stock_code.values:
    stock_node = Node('股票', 代码=code, 名称=name, name=name)
    try:
        graph.create(stock_node)
    except:
        continue

## 1. Financial Abstract

In [14]:
# create financial abstract constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:财务指标) REQUIRE (n.股票代码, n.指标类型) IS UNIQUE'
graph.run(cypher=cypher)
df_fin_ab_demo = ak.stock_financial_abstract(symbol='000001')
for abstract_type in df_fin_ab_demo['选项']:
    # create financial abstract indicators constraint
    cypher = f'CREATE CONSTRAINT IF NOT EXISTS FOR (n:{abstract_type}) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
    graph.run(cypher=cypher)

for code in tqdm(stock_code_list):
    # stock = matcher.match("股票", 代码=code).first()
    # fin_ab = matcher.match("财务指标", 股票代码=code, 指标类型='常用指标').first()
    # if stock is not None and fin_ab is not None:
    #     continue
    # get financial abstract info
    try:
        df_fin_ab = ak.stock_financial_abstract(symbol=code).copy()
    except:
        log.warning(f'Can not get {code} financial abstract info')
        
    for abstract_type, df_group in df_fin_ab.groupby('选项'):
        # create financial abstract node
        fin_ab_node = Node('财务指标', 股票代码=code, 指标类型=abstract_type, name=abstract_type)
        try:
            graph.create(fin_ab_node)
        except:
            pass

        # create stock - financial abstract relationship
        stock = matcher.match("股票", 代码=code).first()
        fin_ab = matcher.match("财务指标", 股票代码=code, 指标类型=abstract_type).first()
        realation = Relationship(stock, '基本面', fin_ab)
        try:
            graph.create(realation)
        except:
            pass

        # create financial abstract indicators node
        df = df_group.drop('选项', axis=1).set_index('指标').copy()
        df = df.T.reset_index(drop=False, names='报告期').copy()
        df['股票代码'] = code
        df['报告期'] = pd.to_datetime(df['报告期'])
        df = df[df['报告期'] > datetime(2020,12,31)]
        df['报告期'] = df['报告期'].dt.strftime('%Y-%m-%d')
        df = df.fillna('null').copy()
        for record in df.to_dict('records'):
            record.update({'name': record['报告期']})
            fin_ab_indicator_node = Node(abstract_type, **record)
            try:
                graph.create(fin_ab_indicator_node)
            except:
                pass

        # create financial abstract-financial abstract indicators relationship
        fin_ab = matcher.match("财务指标",股票代码=code, 指标类型=abstract_type).first()
        fin_ab_indicator_ls = matcher.match(abstract_type, 股票代码=code)
        for fin_ab_indicator in fin_ab_indicator_ls: 
            realation = Relationship(fin_ab, '按报告期', fin_ab_indicator)
            try:
                graph.create(realation)
            except:
                pass

100%|██████████| 40/40 [00:53<00:00,  1.33s/it]


## 2. Research Report

In [8]:
def _process_reasearch_report(df: pd.DataFrame):
    df['日期'] = pd.to_datetime(df['日期'])
    df = df[df['日期'] >= datetime.now() - timedelta(365)].copy()
    df_rsh_repo = df[['股票代码']].drop_duplicates().copy()
    df_rsh_repo['近一年研报数量'] = len(df)
    df_rsh_repo['买入/增持评级数量'] = len(df[df['东财评级'].isin(['买入', '增持'])])
    if df['2024-盈利预测-收益'].isna().sum() == len(df):
        df_rsh_repo['2024盈利预测-收益'] = 'null'
    else:
        df_rsh_repo['2024盈利预测-收益'] = df[~df['2024-盈利预测-收益'].isna()]['2024-盈利预测-收益'].mean()
    if df['2024-盈利预测-市盈率'].isna().sum() == len(df):
        df_rsh_repo['2024盈利预测-市盈率'] = 'null'
    else:
        df_rsh_repo['2024盈利预测-市盈率'] = df[~df['2024-盈利预测-市盈率'].isna()]['2024-盈利预测-市盈率'].mean()
    if df['2025-盈利预测-收益'].isna().sum() == len(df):
        df_rsh_repo['2025盈利预测-收益'] = 'null'
    else:
        df_rsh_repo['2025盈利预测-收益'] = df[~df['2025-盈利预测-收益'].isna()]['2025-盈利预测-收益'].mean()
    if df['2025-盈利预测-市盈率'].isna().sum() == len(df):
        df_rsh_repo['2025盈利预测-市盈率'] = 'null'
    else:
        df_rsh_repo['2025盈利预测-市盈率'] = df[~df['2025-盈利预测-市盈率'].isna()]['2025-盈利预测-市盈率'].mean()
    return df_rsh_repo

In [9]:
df = ak.stock_research_report_em(symbol='000001')

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
df

Unnamed: 0,序号,股票代码,股票简称,报告名称,东财评级,机构,近一月个股研报数,2024-盈利预测-收益,2024-盈利预测-市盈率,2025-盈利预测-收益,2025-盈利预测-市盈率,2026-盈利预测-收益,2026-盈利预测-市盈率,行业,日期,报告PDF链接
0,1,000001,平安银行,信贷存款回暖，计息成本改善,增持,天风证券,1,,,2.31,4.84,2.38,4.70,银行,2025-04-22,https://pdf.dfcfw.com/pdf/H3_AP202504221660307...
1,2,000001,平安银行,年报点评报告：非息亮眼，质量稳健,增持,天风证券,1,,,2.43,4.74,2.52,4.56,银行,2025-03-19,https://pdf.dfcfw.com/pdf/H3_AP202503191644684...
2,3,000001,平安银行,2024年年报点评：结构优化见效，风险处置有力,买入,民生证券,1,,,,,,,银行,2025-03-18,https://pdf.dfcfw.com/pdf/H3_AP202503181644529...
3,4,000001,平安银行,点评报告：结构调整持续，资产质量指标变动不大,增持,万联证券,1,,,2.36,5.08,2.38,5.03,银行,2025-03-18,https://pdf.dfcfw.com/pdf/H3_AP202503181644488...
4,5,000001,平安银行,贷款结构优化，营收降幅收窄,中性,国信证券,1,,,,,,,银行,2025-03-16,https://pdf.dfcfw.com/pdf/H3_AP202503161644416...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276,277,000001,平安银行,零售业务核心策略不变,,中银国际,1,,,,,,,银行,2017-03-22,https://pdf.dfcfw.com/pdf/H3_AP201703220427468...
277,278,000001,平安银行,2016年年报点评：业绩低于预期，估值水平低于同业,增持,东莞证券,1,,,,,,,银行,2017-03-20,https://pdf.dfcfw.com/pdf/H3_AP201703200421759...
278,279,000001,平安银行,2017年报业绩点评：零售银行初见成效，不良加速认定,,民生证券,1,,,,,,,银行,2017-03-17,https://pdf.dfcfw.com/pdf/H3_AP201703170415156...
279,280,000001,平安银行,零售金融盈利贡献度大幅提升，息差环比回升,买入,交银国际证券,1,,,,,,,银行,2017-03-07,https://pdf.dfcfw.com/pdf/H3_AP201705190591661...


In [12]:
_process_reasearch_report(df)

Unnamed: 0,股票代码,近一年研报数量,买入/增持评级数量,2024盈利预测-收益,2024盈利预测-市盈率,2025盈利预测-收益,2025盈利预测-市盈率
0,1,15,11,2.396667,4.59,2.425556,4.611111


In [15]:
# create research report constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:个股研报) REQUIRE (n.股票代码) IS UNIQUE'
graph.run(cypher=cypher)


# get research report info
df_rsh_repo_ls = []
for code in tqdm(stock_code_list):
    try:
        df_rsh_repo = ak.stock_research_report_em(symbol=code).copy()
        df_rsh_repo = _process_reasearch_report(df_rsh_repo)
        df_rsh_repo_ls.append(df_rsh_repo)
    except:
        continue
        # log.warning(f'{code} has no research report info')
if len(df_rsh_repo_ls) == 0:
    raise ConnectionError('Can not get stock research report from akshare')
else:
    df_rsh_repo = pd.concat(df_rsh_repo_ls, ignore_index=True)
    
    
for record in tqdm(df_rsh_repo.to_dict('records')):
    # create research report nodeb
    rsh_report_node = Node('个股研报', name='个股研报', **record)
    try:
        graph.create(rsh_report_node)
    except:
        pass
    
    # create stock - main business composition relationship
    stock = matcher.match("股票", 代码=record['股票代码']).first()
    if stock is None:
        continue
    rsh_report = matcher.match("个股研报", 股票代码=record['股票代码']).first()
    realation = Relationship(stock, '基本面', rsh_report)
    try:
        graph.create(realation)
    except:
        pass

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  2%|▎         | 1/40 [00:01<00:59,  1.53s/it]

  0%|          | 0/4 [00:00<?, ?it/s]

  5%|▌         | 2/40 [00:03<01:03,  1.66s/it]

0it [00:00, ?it/s]

  8%|▊         | 3/40 [00:03<00:38,  1.04s/it]

  0%|          | 0/1 [00:00<?, ?it/s]

 10%|█         | 4/40 [00:04<00:31,  1.15it/s]

0it [00:00, ?it/s]

 12%|█▎        | 5/40 [00:04<00:22,  1.52it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 15%|█▌        | 6/40 [00:05<00:21,  1.61it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 18%|█▊        | 7/40 [00:05<00:20,  1.62it/s]

0it [00:00, ?it/s]

 20%|██        | 8/40 [00:05<00:16,  1.94it/s]

0it [00:00, ?it/s]

 22%|██▎       | 9/40 [00:06<00:13,  2.26it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 25%|██▌       | 10/40 [00:06<00:14,  2.04it/s]

0it [00:00, ?it/s]

 28%|██▊       | 11/40 [00:07<00:12,  2.36it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 30%|███       | 12/40 [00:07<00:13,  2.12it/s]

0it [00:00, ?it/s]

 32%|███▎      | 13/40 [00:07<00:11,  2.39it/s]

0it [00:00, ?it/s]

 35%|███▌      | 14/40 [00:08<00:10,  2.54it/s]

0it [00:00, ?it/s]

 38%|███▊      | 15/40 [00:08<00:09,  2.75it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 40%|████      | 16/40 [00:09<00:10,  2.33it/s]

0it [00:00, ?it/s]

 42%|████▎     | 17/40 [00:09<00:08,  2.59it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 45%|████▌     | 18/40 [00:10<00:09,  2.22it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 48%|████▊     | 19/40 [00:10<00:10,  2.04it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 50%|█████     | 20/40 [00:11<00:10,  1.93it/s]

0it [00:00, ?it/s]

 52%|█████▎    | 21/40 [00:11<00:08,  2.25it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 55%|█████▌    | 22/40 [00:12<00:10,  1.67it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 57%|█████▊    | 23/40 [00:13<00:10,  1.69it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 60%|██████    | 24/40 [00:13<00:09,  1.72it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

 62%|██████▎   | 25/40 [00:14<00:10,  1.42it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 65%|██████▌   | 26/40 [00:15<00:09,  1.51it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 68%|██████▊   | 27/40 [00:15<00:08,  1.55it/s]

0it [00:00, ?it/s]

 70%|███████   | 28/40 [00:16<00:06,  1.86it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 72%|███████▎  | 29/40 [00:16<00:07,  1.55it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 75%|███████▌  | 30/40 [00:17<00:06,  1.60it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 78%|███████▊  | 31/40 [00:18<00:06,  1.35it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 80%|████████  | 32/40 [00:19<00:05,  1.44it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 82%|████████▎ | 33/40 [00:19<00:04,  1.50it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 85%|████████▌ | 34/40 [00:20<00:03,  1.57it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 88%|████████▊ | 35/40 [00:21<00:03,  1.37it/s]

0it [00:00, ?it/s]

 90%|█████████ | 36/40 [00:21<00:02,  1.68it/s]

0it [00:00, ?it/s]

 92%|█████████▎| 37/40 [00:21<00:01,  2.00it/s]

0it [00:00, ?it/s]

 95%|█████████▌| 38/40 [00:22<00:00,  2.31it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

 98%|█████████▊| 39/40 [00:22<00:00,  2.04it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 40/40 [00:23<00:00,  1.72it/s]
  df_rsh_repo = pd.concat(df_rsh_repo_ls, ignore_index=True)
100%|██████████| 11/11 [00:00<00:00, 58.69it/s]


## 3. Main Business Composition

In [16]:
def assign_exchange_prefix(code: str):
    if code.startswith('0') or code.startswith('3'):
        code = 'SZ' + code
    else:
        code = 'SH' + code
    return code

In [17]:
# create main business composition constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主营构成) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
graph.run(cypher=cypher)

# create main business constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主营业务) REQUIRE (n.股票代码, n.报告期, n.业务名称) IS UNIQUE'
graph.run(cypher=cypher)


for code in tqdm(stock_code_list):
    # get main business info
    try:
        symbol = assign_exchange_prefix(code)
        df_main_business = ak.stock_zygc_em(symbol=symbol).copy()
        df_main_business = df_main_business.fillna('nan').copy()
    except:
        log.warning(f'Can not get {code} main business info')
        continue
        
    df_main_business = df_main_business[df_main_business['分类类型'] == '按产品分类'].copy()
    df_main_business['报告日期'] = pd.to_datetime(df_main_business['报告日期'])
    df_main_business = df_main_business[df_main_business['报告日期'] > datetime(2020,12,31)].copy()
    df_main_business['报告日期'] = df_main_business['报告日期'].dt.strftime('%Y-%m-%d')
    for report_time, df_group in df_main_business.groupby('报告日期'):
        # create main business composition node
        main_bus_compo_node = Node('主营构成', 股票代码=code, 报告期=report_time, name=report_time)
        try:
            graph.create(main_bus_compo_node)
        except:
            pass

        # create stock - main business composition relationship
        stock = matcher.match("股票", 代码=code).first()
        main_bus_compo = matcher.match("主营构成", 股票代码=code, 报告期=report_time).first()
        realation = Relationship(stock, '基本面', main_bus_compo)
        try:
            graph.create(realation)
        except:
            pass

        # create main business node
        df_group = df_group[['股票代码', '报告日期', '主营构成', '主营收入', '收入比例', '主营成本',
                             '成本比例', '主营利润', '利润比例', '毛利率']].copy()
        df_group = df_group.rename(columns={'主营构成': '业务名称', '报告日期': '报告期'})
        for record in df_group.to_dict('records'):
            record.update({'name': record['业务名称']})
            main_business_node = Node('主营业务', **record)
            try:
                graph.create(main_business_node)
            except:
                pass

        # create main business composition-main business relationship
        main_bus_compo = matcher.match("主营构成", 股票代码=code, 报告期=report_time).first()
        main_bus_ls = matcher.match("主营业务", 股票代码=code, 报告期=report_time)
        for main_bus in main_bus_ls: 
            realation = Relationship(main_bus_compo, '按产品分类', main_bus)
            try:
                graph.create(realation)
            except:
                pass

100%|██████████| 40/40 [00:22<00:00,  1.82it/s]


## 4. Main Shareholders

In [18]:
def assign_exchange_prefix(code: str):
    if code.startswith('0') or code.startswith('3'):
        code = 'sz' + code
    else:
        code = 'sh' + code
    return code

In [22]:
def get_all_quaterly_report_time(look_back_period: int = 12):
    """
    get all quarterly report time
    :param look_back_period: look back period, default is 4
    :return: all quarterly report time
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(look_back_period * 90)
    all_quaterly_report_time = pd.date_range(start=start_date, end=end_date, freq='Q')
    all_quaterly_report_time = all_quaterly_report_time.strftime('%Y%m%d').to_list()
    return all_quaterly_report_time
    

In [24]:
# create main shareholders constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:主要股东) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
graph.run(cypher=cypher)

# create shares holding constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:持股信息) REQUIRE (n.股票代码, n.报告期, n.股东名称) IS UNIQUE'
graph.run(cypher=cypher)

report_time_ls = get_all_quaterly_report_time()

for code in tqdm(stock_code_list):
    # get main shareholders info
    for report_time in report_time_ls:
        try:
            symbol = assign_exchange_prefix(code)
            df_main_shareholders = ak.stock_gdfx_top_10_em(symbol=symbol, date=report_time).copy()
        except:
            log.warning(f'Can not get {code} {report_time} main shareholders info')
            continue

        df_main_shareholders = df_main_shareholders[:3].fillna('nan').copy()
        # create main shareholders node
        main_shareholders_node = Node('主要股东', 股票代码=code, 报告期=report_time, name=report_time)
        try:
            graph.create(main_shareholders_node)
        except:
            break

        # create stock - main shareholders relationship
        stock = matcher.match("股票", 代码=code).first()
        main_shareholders = matcher.match("主要股东", 股票代码=code, 报告期=report_time).first()
        realation = Relationship(stock, '基本面', main_shareholders)
        try:
            graph.create(realation)
        except:
            pass

        # create shares holding node
        for record in df_main_shareholders.to_dict('records'):
            record.update({'name': record['股东名称'], '股票代码': code, '报告期': report_time})
            shares_holding = Node('持股信息', **record)
            try:
                graph.create(shares_holding)
            except:
                break

        # create main business composition-main business relationship
        main_shareholders = matcher.match("主要股东", 股票代码=code, 报告期=report_time).first()
        shares_holding_ls = matcher.match("持股信息", 股票代码=code, 报告期=report_time)
        for shares_holding in shares_holding_ls: 
            realation = Relationship(main_shareholders, '按持股比例', shares_holding)
            try:
                graph.create(realation)
            except:
                break

  all_quaterly_report_time = pd.date_range(start=start_date, end=end_date, freq='Q')
100%|██████████| 40/40 [01:49<00:00,  2.73s/it]


## 5.Price Indicators

In [25]:
import akshare as ak

stock_zh_a_hist_df = ak.stock_zh_a_hist(symbol="000004", period="daily", start_date="20230412", end_date='20240412', adjust="hfq")
stock_zh_a_hist_df

Unnamed: 0,日期,股票代码,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌幅,涨跌额,换手率
0,2023-04-12,000004,35.05,35.78,36.22,35.05,22748,22871246.00,3.33,1.85,0.65,1.91
1,2023-04-13,000004,35.82,36.02,36.46,35.54,19624,19884792.00,2.57,0.67,0.24,1.65
2,2023-04-14,000004,36.02,35.70,36.26,35.09,28161,28173191.90,3.25,-0.89,-0.32,2.37
3,2023-04-17,000004,35.66,36.67,37.72,35.58,40813,42057949.00,5.99,2.72,0.97,3.43
4,2023-04-18,000004,36.59,37.27,37.39,36.26,25010,25822813.44,3.08,1.64,0.60,2.10
...,...,...,...,...,...,...,...,...,...,...,...,...
237,2024-04-08,000004,49.75,48.66,50.84,48.46,61313,82540929.60,4.79,-2.03,-1.01,4.86
238,2024-04-09,000004,48.46,48.74,48.99,47.17,51947,68222889.00,3.74,0.16,0.08,4.11
239,2024-04-10,000004,48.06,45.88,48.78,45.31,58559,74440448.00,7.12,-5.87,-2.86,4.64
240,2024-04-11,000004,45.59,46.40,47.33,45.19,44937,57385983.25,4.66,1.13,0.52,3.56


In [26]:
def process_date_to_report_time(date: datetime):
    date = pd.to_datetime(date)
    year = date.year
    first_season_end_date = datetime(year, 3, 31)
    second_season_end_date = datetime(year, 6, 30)
    third_season_end_date = datetime(year, 9, 30)
    last_season_end_date = datetime(year, 12, 31)
    if date < first_season_end_date:
        return first_season_end_date.strftime('%Y-%m-%d')
    if date < second_season_end_date:
        return second_season_end_date.strftime('%Y-%m-%d')
    if date < third_season_end_date:
        return third_season_end_date.strftime('%Y-%m-%d')
    if date < last_season_end_date:
        return last_season_end_date.strftime('%Y-%m-%d')
    
def MA(data, n):
    MA = pd.Series(data['收盘'].rolling(n).mean().round(2), name='MA_' + str(n))
    return MA.fillna('nan')

def EMA(data, n):
    EMA = pd.Series(data['收盘'].ewm(span=n, min_periods=n).mean().round(2), name='EMA_' + str(n))
    return EMA.fillna('nan')

def RSI(data, n):
    lc = data['收盘'].shift(1)
    diff = data['收盘'] - lc
    up = diff.where(diff > 0, 0)
    down = -diff.where(diff < 0, 0)
    ema_up = up.ewm(alpha=1/n, adjust=False).mean()
    ema_down = down.ewm(alpha=1/n, adjust=False).mean()
    rs = ema_up / ema_down
    rsi = round(100 - 100 / (1 + rs), 2)
    return pd.Series(rsi, name='RSI_' + str(n)).fillna('nan')


def MACD_Level(df, n_fast, n_slow):
    data = df.copy()
    EMAfast = data['收盘'].ewm(span=n_fast, min_periods=n_slow).mean().round(2)
    EMAslow = data['收盘'].ewm(span=n_slow, min_periods=n_slow).mean().round(2)
    data['MACD'] = EMAfast - EMAslow
    data['MACDsignal'] = data['MACD'].ewm(span=9, min_periods=9).mean().round(2)
    data['MACDhist'] = data['MACD'] - data['MACDsignal']
    return data[['MACD', 'MACDsignal', 'MACDhist']].fillna('nan')


In [27]:
# create price data constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:基础行情) REQUIRE (n.股票代码, n.基础行情类型) IS UNIQUE'
graph.run(cypher=cypher)

# create price indicators constraint
cypher = 'CREATE CONSTRAINT IF NOT EXISTS FOR (n:技术指标) REQUIRE (n.股票代码, n.技术指标类型) IS UNIQUE'
graph.run(cypher=cypher)


start_date = (datetime.now() - timedelta(365)).strftime('%Y%m%d')
end_date = datetime.now().strftime('%Y%m%d')

neccessry_price_data_type = ['收盘', '成交量', '涨跌幅', '换手率']
extra_price_data_type = ['总市值', '市盈率(静)', '市盈率(TTM)', '市净率']

for code in tqdm(stock_code_list):
    stock = matcher.match("股票", 代码=code).first()
    if stock is None:
        continue
    # get price info
    try:
        df_price = ak.stock_zh_a_hist(symbol=code, period="daily", start_date=start_date, end_date=end_date, adjust="hfq")
        df_price = df_price[['日期'] + neccessry_price_data_type].copy()
        df_price['报告期'] = df_price['日期'].apply(process_date_to_report_time)
        df_price['日期'] = df_price['日期'].astype(str)
    except:
        log.warning(f'Can not get {code} price data')
        continue

    for price_type in extra_price_data_type:
        try:
            df_price_extra = ak.stock_zh_valuation_baidu(symbol=code, indicator=price_type, period="近一年")
            df_price_extra = df_price_extra.rename(columns={'date': '日期', 'value': price_type})
            df_price_extra['日期'] = df_price_extra['日期'].astype(str)
        except:
            log.warning(f'Can not get {code} {price_type} data')
            df_price_extra = pd.DataFrame(columns=['日期', price_type])
        df_price = df_price.merge(df_price_extra, on='日期', how='left', validate='1:1')
        df_price[price_type] = df_price[price_type].fillna('nan')


    for price_type in neccessry_price_data_type + extra_price_data_type:
        # create price data node
        price_data_node = Node('基础行情', 股票代码=code, 基础行情类型=price_type, name=price_type)
        try:
            graph.create(price_data_node)
        except:
            pass

        # create stock - main shareholders relationship
        stock = matcher.match("股票", 代码=code).first()
        price_data = matcher.match("基础行情", 股票代码=code, 基础行情类型=price_type).first()
        realation = Relationship(stock, '行情', price_data)
        try:
            graph.create(realation)
        except:
            pass

        for report_time, df in df_price.groupby('报告期'):
            cypher = f'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{price_type}`) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
            graph.run(cypher=cypher)

            assert price_type in df.columns, f'{price_type} not in {code} {report_time} price data'
            attribute = {}
            for date, value in df[['日期', price_type]].values:
                attribute[date] = value

            price_detail_node = Node(price_type, 股票代码=code, 报告期=report_time, name=report_time, **attribute)
            try:
                graph.create(price_detail_node)
            except:
                continue

            # create price_data - price_detail relationship
            price_data = matcher.match("基础行情", 股票代码=code, 基础行情类型=price_type).first()
            price_detail = matcher.match(price_type, 股票代码=code, 报告期=report_time).first()
            realation = Relationship(price_data, '基础行情数据', price_detail)
            try:
                graph.create(realation)
            except:
                pass
    
    df_indicators = df_price[['日期', '报告期', '收盘']].copy()
    df_indicators['五日移动均线(MA5)'] = MA(data=df_indicators, n=5)
    df_indicators['二十日移动均线(MA20)'] = MA(data=df_indicators, n=20)
    df_indicators['五日指数移动均线(EMA5)'] = EMA(data=df_indicators, n=5)
    df_indicators['二十日指数移动均线(EMA20)'] = EMA(data=df_indicators, n=20)
    df_indicators['相对强弱指标(RSI)'] = RSI(data=df_indicators, n=14)
    df_indicators[['MACD', '信号线(MACD_signal)', '离差图(MACD_hist)']] = MACD_Level(df=df_indicators, n_fast=12, n_slow=26)
    df_indicators = df_indicators.drop('收盘', axis=1)
    

    for col in df_indicators.columns:
        if col not in ['日期', '报告期']:
            idicator_type = col
        else:
            continue
        # create price idicator type node
        price_idicator_node = Node('技术指标', 股票代码=code, 技术指标类型=idicator_type, name=idicator_type)
        try:
            graph.create(price_idicator_node)
        except:
            pass

        # create stock - price idicator relationship
        stock = matcher.match("股票", 代码=code).first()
        price_idicator = matcher.match("技术指标", 股票代码=code, 技术指标类型=idicator_type).first()
        realation = Relationship(stock, '行情', price_idicator)
        try:
            graph.create(realation)
        except:
            pass

        for report_time, df in df_indicators.groupby('报告期'):
            cypher = f'CREATE CONSTRAINT IF NOT EXISTS FOR (n:`{idicator_type}`) REQUIRE (n.股票代码, n.报告期) IS UNIQUE'
            graph.run(cypher=cypher)

            assert idicator_type in df.columns, f'{idicator_type} not in {code} {report_time} price data'
            attribute = {}
            for date, value in df[['日期', idicator_type]].values:
                value = round(value, 2) if value !='nan' else value
                attribute[date] = value

            indicator_detail = Node(idicator_type, 股票代码=code, 报告期=report_time, name=report_time, **attribute)
            try:
                graph.create(indicator_detail)
            except:
                pass

            # create price_indicator - indicator_detail relationship
            price_indicator = matcher.match("技术指标", 股票代码=code, 技术指标类型=idicator_type).first()
            indicator_detail = matcher.match(idicator_type, 股票代码=code, 报告期=report_time).first()
            realation = Relationship(price_indicator, '技术指标数据', indicator_detail)
            try:
                graph.create(realation)
            except:
                pass

100%|██████████| 40/40 [01:31<00:00,  2.28s/it]
