# Set Data API
* We used AKShare API(https://github.com/akfamily/akshare) in the following data aquisition process.
* Should point out that the original "ak.stock_news_em" stock news interface only support 100 most recent news, which may not be enought for training data formulation. So we added a new request parameter "page" to the source function to achieve multiple pages access.

In [None]:
! pip install akshare --upgrade

## Save News Data to Local

In [1]:
import os
os.chdir("/Users/mac/Desktop/FinGPT_Forecasting_Project/")

In [2]:
import pandas as pd
HS300_stock = pd.read_csv("HS300Index.csv", header=None)
HS_index = HS300_stock[0].tolist()
HS_index = ["0"*(6-len(str(i)))+str(i) for i in HS_index]

In [53]:
# with open("HS300_Index.txt", 'w') as f:
#     write_item = "\n".join(HS_index)
#     f.write(write_item)
# f.close()

In [13]:
# Data Aquisition Pipeline
# from datetime import date
# import akshare as ak
# name_dir = "HS300_news_data" + str(date.today().strftime("%Y%m%d"))
# # os.makedirs(name_dir, exist_ok=True)
# os.chdir(name_dir)
# for i in HS_index[2:]:
#     file_name = "news_data" + i + ".csv"
#     df_list = []
#     for j in range(1, 20):
#         try:
#             df_list.append(ak.stock_news_em(symbol=i, page=j))
#         except KeyError:
#             print(str(j) + "pages obtained for symbol: " + i)
#             break
#     news_df_i = pd.concat(df_list, ignore_index=True)
#     news_df_i.to_csv(file_name)
#     print("================== symbol"+i+"completed! ==================")

11pages obtained for symbol: 000063
6pages obtained for symbol: 000069
9pages obtained for symbol: 000100
4pages obtained for symbol: 000157
6pages obtained for symbol: 000166
7pages obtained for symbol: 000301
10pages obtained for symbol: 000333
5pages obtained for symbol: 000338
5pages obtained for symbol: 000408
4pages obtained for symbol: 000425
5pages obtained for symbol: 000538
10pages obtained for symbol: 000568
6pages obtained for symbol: 000596
3pages obtained for symbol: 000617
11pages obtained for symbol: 000625
9pages obtained for symbol: 000651
9pages obtained for symbol: 000661
3pages obtained for symbol: 000708
3pages obtained for symbol: 000723
8pages obtained for symbol: 000725
3pages obtained for symbol: 000733
3pages obtained for symbol: 000768
5pages obtained for symbol: 000776
3pages obtained for symbol: 000786
6pages obtained for symbol: 000792
6pages obtained for symbol: 000800
11pages obtained for symbol: 000858
8pages obtained for symbol: 000876
3pages obtained

# Data Aquisition

In [3]:
import akshare as ak
import pandas as pd
import os
import csv
import re
import time
import math
import json
import random
from datasets import Dataset
import datasets

# os.chdir("/Users/mac/Desktop/FinGPT_Forecasting_Project/")
# print(os.getcwd())

start_date = "20230406"
end_date = "20230502"

In [4]:
def return_transform(ret):
    
    up_down = '涨' if ret >= 0 else '跌'
    integer = math.ceil(abs(100 * ret))
    if integer == 0:
        return "平"
    
    return up_down + (str(integer) if integer <= 5 else '5+')

def get_return(symbol, adjust="hfq"):
    """
    Get stock return data.

    Args:
        symbol: str
            A-share market stock symbol
        adjust: str ("qfq", "hfq")
            price ajustment
            default = "hfq" 后复权
    
    Return:
        weekly forward filled return data
    """
    
    # load data
    return_data = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date=start_date, end_date=end_date, adjust=adjust)
    
    # process timestamp
    return_data["日期"] = pd.to_datetime(return_data["日期"])
    return_data.set_index("日期", inplace=True)

    # resample and filled with forward data
    weekly_data = return_data["收盘"].resample("W").ffill()
    weekly_returns = weekly_data.pct_change()[1:]
    weekly_start_prices = weekly_data[:-1]
    weekly_end_prices = weekly_data[1:]
    weekly_data = pd.DataFrame({
        '起始日期': weekly_start_prices.index,
        '起始价': weekly_start_prices.values,
        '结算日期': weekly_end_prices.index,
        '结算价': weekly_end_prices.values,
        '周收益': weekly_returns.values
    })
    weekly_data["简化周收益"] = weekly_data["周收益"].map(return_transform)
    
    return weekly_data

# get basics
def get_basic(symbol, data, no_validrange = False):
    """
    Get and match basic data to news dataframe.

    Args:
        symbol: str
            A-share market stock symbol
        data: DataFrame
            dated news data
        no_validrange: bool
            False to set range for the valid basic_info.
            default valid range is 3 weeks after the basic_info reported
    
    Return:
        financial news dataframe with matched basic_financial info
    """

    # load quarterly basic data
    basic_quarter_financials = ak.stock_financial_abstract_ths(symbol = symbol, indicator="按单季度")

    basic_fin_dict = basic_quarter_financials.to_dict("index")
    basic_fin_list = [dict([(key, val) for key, val in basic_fin_dict[i].items() if val]) for i in range(len(basic_fin_dict))]

    # match basic financial data to news dataframe
    matched_basic_fin = []
    for i, row in data.iterrows():

        newsweek_enddate = row['结算日期'].strftime("%Y-%m-%d")
        
        # set 3 weeks for the valid time of quarterly basic_financial info 
        check_date = start_date if i<2 else data.loc[i-2, '起始日期'].strftime("%Y-%m-%d")

        matched_basic = {}
        for basic in basic_fin_list:
            if (no_validrange and basic["报告期"] < newsweek_enddate) or (check_date <= basic["报告期"] < newsweek_enddate):
                matched_basic = basic
                break
        matched_basic_fin.append(json.dumps(matched_basic, ensure_ascii=False))

    data['基本面'] = matched_basic_fin

    return data

def raw_financial_data(symbol, with_basics = True, no_validrange = True):
    
    # get return data from API
    data = get_return(symbol=symbol)
    
    # get news data from local
    file_name = "news_data" + symbol + ".csv"
    news_df = pd.read_csv("HS300_news_data20240118/"+file_name, index_col=0)
    news_df["发布时间"] = pd.to_datetime(news_df["发布时间"], exact=False, format="%Y-%m-%d")
    news_df.sort_values(by=["发布时间"], inplace=True)
    
    # match weekly news for return data
    news_list = []
    for a, row in data.iterrows():
        week_start_date = row['起始日期'].strftime('%Y-%m-%d')
        week_end_date = row['结算日期'].strftime('%Y-%m-%d')
        print(symbol, ': ', week_start_date, ' - ', week_end_date)
        
        weekly_news = news_df.loc[(news_df["发布时间"]>week_start_date) & (news_df["发布时间"]<week_end_date)]

        weekly_news = [
            {
                "发布时间": n["发布时间"].strftime('%Y%m%d'),
                "新闻标题": n['新闻标题'],
                "新闻内容": n['新闻内容'],
            } for a, n in weekly_news.iterrows()
        ]
        news_list.append(json.dumps(weekly_news,ensure_ascii=False))

    data["新闻"] = news_list

    if with_basics:
        data = get_basic(symbol=symbol, data=data, no_validrange=no_validrange)
        # data.to_csv(symbol+start_date+"_"+end_date+".csv")
    else:
        data['新闻'] = [json.dumps({})] * len(data)
        # data.to_csv(symbol+start_date+"_"+end_date+"_nobasics.csv")
    
    return data

# Prompt Generation

In [5]:
def get_company_prompt(symbol):
    """
    Get company information from API
    """
    try:
        company_profile = dict(ak.stock_individual_info_em(symbol).values)
    except:
        print("Company Info Request Time Out! Please wait and retry.")
    company_profile["上市时间"] =  pd.to_datetime(company_profile["上市时间"]).strftime("%Y年%m月%d日")

    template = "[公司介绍]:\n\n{股票简称}是一家在{行业}行业的领先实体。自{上市时间}成立并公开交易以来，该公司已确立其作为市场主要参与者之一的声誉。截止今天，{股票简称}的总市值为{总市值}人民币，总股本数为{总股本}，流通市值为{流通市值}人民币，流通股数为{流通股}。" \
        "\n\n{股票简称}主要在中国运营，以股票代码{股票代码}在交易所进行交易。作为在{行业}行业的一个主导力量，该公司持续创新，不断推动行业进步。"
    
    formatted_profile = template.format(**company_profile)

    return formatted_profile

def map_return_label(return_lb):
    """
    Map abbrev in the raw data
    Example:
        涨1 -- 上涨1%
        跌2 -- 下跌2%
        平 -- 股价持平
    """

    lb = return_lb.replace('涨', '上涨')
    lb = lb.replace('跌', '下跌')
    lb = lb.replace('平', '股价持平')
    lb = lb.replace('1', '0-1%')
    lb = lb.replace('2', '1-2%')
    lb = lb.replace('3', '2-3%')
    lb = lb.replace('4', '3-4%')
    if lb.endswith('+'):
        lb = lb.replace('5+', '超过5%')
    else:
        lb = lb.replace('5', '4-5%')
    
    return lb

def get_prompt_by_row(symbol, row):
    """
    Generate prompt for each row in the raw data DF
    Args:
        symbol: str
            stock ticker
        row: pandas.Series
    Return:
        head: heading prompt
        news: news info
        basics: basic financial info
    """

    week_start_date = row['起始日期'] if isinstance(row['起始日期'], str) else row['起始日期'].strftime('%Y-%m-%d')
    week_end_date = row['结算日期'] if isinstance(row['结算日期'], str) else row['结算日期'].strftime('%Y-%m-%d')
    term = '上涨' if row['结算价'] > row['起始价'] else '下跌'
    head = "自{}至{}，{}的股票价格由{:.2f}{}至{:.2f}。在此期间的公司新闻如下所列:\n\n".format(
        week_start_date, week_end_date, symbol, row['起始价'], term, row['结算价'])

    news = json.loads(row["新闻"])
    news = ["[新闻标题]：{}\n[新闻内容]：{}\n".format(
        n['新闻标题'], n['新闻内容']) for n in news if n['发布时间'][:8] <= week_end_date.replace('-', '')]

    basics = json.loads(row['基本面'])
    if basics:
        basics = "如下所列为{}近期的一些金融基本面信息，记录时间为{}:\n\n[金融基本面]:\n\n".format(
            symbol, basics['报告期']) + "\n".join(f"{k}: {v}" for k, v in basics.items() if k != 'period')
    else:
        basics = "[金融基本面]:\n\n 无金融基本面记录"

    return head, news, basics

def sample_news(news, k=5):
    """
    Ramdomly select past news.

    Args:
        news:
            newslist in the timerange
        k: int
            the number of selected news
    """
    return [news[i] for i in sorted(random.sample(range(len(news)), k))]

def get_all_prompts(symbol, min_past_week=1, max_past_weeks=3, with_basics=True):
    """
    Generate prompt. The prompt consists of news from past weeks, basics financial information, and weekly return.
    History news in the prompt is chosen from past weeks range from min_past_week to max_past_week, 
    and there is a number constraint on ramdomly selected data (default: up to 5).

    Args:
        symbol: str
            stock ticker
        min_past_week: int
        max_past_week: int
        with_basics: bool
            If true, add basic infomation to the prompt
            
    Return:
        Prompts for the daterange
    """

    # Load Data
    df = raw_financial_data(symbol, with_basics=with_basics)
    
    company_prompt = get_company_prompt(symbol)

    prev_rows = []
    all_prompts = []

    for row_idx, row in df.iterrows():

        prompt = ""

        # judge for available history news 
        if len(prev_rows) >= min_past_week:

            # randomly set retrieve data of past weeks
            idx = min(random.choice(range(min_past_week, max_past_weeks+1)), len(prev_rows))
            for i in range(-idx, 0):
                # Add Head
                prompt += "\n" + prev_rows[i][0]
                # Add History News (with numbers constraint)
                sampled_news = sample_news(
                    prev_rows[i][1],
                    min(5, len(prev_rows[i][1]))
                )
                if sampled_news:
                    prompt += "\n".join(sampled_news)
                else:
                    prompt += "无有关新闻报告"
                    
        head, news, basics = get_prompt_by_row(symbol, row)
        
        prev_rows.append((head, news, basics))

        if len(prev_rows) > max_past_weeks:
            prev_rows.pop(0)
        
        # set this to make sure there is history news for each considered date
        if not prompt:
            continue

        prediction = map_return_label(row['简化周收益'])

        prompt = company_prompt + '\n' + prompt + '\n' + basics
        prompt += f"\n\n基于在{row['起始日期'].strftime('%Y-%m-%d')}之前的所有信息，让我们首先分析{symbol}的积极发展和潜在担忧。请简洁地陈述，分别提出2-4个最重要的因素。大部分所提及的因素应该从公司的相关新闻中推断出来。" \
            f"那么让我们假设你对于下一周({row['起始日期'].strftime('%Y-%m-%d')}至{row['结算日期'].strftime('%Y-%m-%d')})的预测是{prediction}。提供一个总结分析来支持你的预测。预测结果需要从你最后的分析中推断出来，因此不作为你分析的基础因素。"

        all_prompts.append(prompt.strip())

    return all_prompts

In [6]:
get_all_prompts("000001")

000001 :  2023-04-09  -  2023-04-16
000001 :  2023-04-16  -  2023-04-23
000001 :  2023-04-23  -  2023-04-30


['[公司介绍]:\n\n平安银行是一家在银行行业的领先实体。自1970年01月01日成立并公开交易以来，该公司已确立其作为市场主要参与者之一的声誉。截止今天，平安银行的总市值为182803749425.16人民币，总股本数为19405918198.0，流通市值为182800252269.0人民币，流通股数为19405546950.0。\n\n平安银行主要在中国运营，以股票代码000001在交易所进行交易。作为在银行行业的一个主导力量，该公司持续创新，不断推动行业进步。\n\n自2023-04-09至2023-04-16，000001的股票价格由2257.00上涨至2268.38。在此期间的公司新闻如下所列:\n\n[新闻标题]：金融业高薪招聘AI训练师 有AI产品负责人年薪超百万\n[新闻内容]：优先考虑具有拥有虚拟偶像、数字人相关产品工作经验者等。 BOSS直聘官网截图 也有一些相对“平易近人”的工作岗位。 如平安银行000001.SZ)网金及财富管理事业部招聘AI业务策略训练师，薪资待遇为1万-1.5万元/月。 平安银行介绍称，该岗位负责业务策略训练，将业务策略在智慧经营平台做落地实施，将策略与各触达方式做串联部署\n\n[新闻标题]：平安银行王蓉晖：矩阵式产品服务实现“真小微 真信用 真普惠”\n[新闻内容]：就业多则经济活。 今年3月，在平安银行开放日上，平安银行董事长谢永林表示，全新的开放银行不仅是支持零售转型战略的新打法，而且通过服务平台上的海量小微客户，平安银行在服务小微，践行普惠金融方面也探索出了新举措。 平安银行财报显示，截至2022年末，平安银行普惠型小微企业贷款累计户数105\n\n[新闻标题]：20股特大单净流入资金超2亿元\n[新闻内容]：491.562.30建筑装饰300120经纬辉开8.7819.952.28电子002051中工国际12.5010.042.25建筑装饰000001平安银行12.691.042.24银行601138工业富联18.072.322.09电子300014亿纬锂能68.904.242.07电力设备 特大单净流出资金排名\n\n如下所列为000001近期的一些金融基本面信息，记录时间为2023-03-31:\n\n[金融基本面]:\n\n报告期: 2023-03-31\n净利润: 146.02亿\n净利润同比增长率

In [6]:
SYSTEM_PROMPT = "你是一个经验丰富的股票市场分析师。你的任务是根据过去几周的相关新闻和基本财务状况，列出公司的积极发展和潜在担忧，然后对公司未来一周的股价走势提供分析和预测。" \
    "你的回答格式应该如下：\n\n[积极发展]：\n1. ...\n\n[潜在担忧]：\n1. ...\n\n[预测和分析]：\n...\n"
print(SYSTEM_PROMPT)

你是一个经验丰富的股票市场分析师。你的任务是根据过去几周的相关新闻和基本财务状况，列出公司的积极发展和潜在担忧，然后对公司未来一周的股价走势提供分析和预测。你的回答格式应该如下：

[积极发展]：
1. ...

[潜在担忧]：
1. ...

[预测和分析]：
...



# Generate Task-Response with GPT

In [7]:
from openai import OpenAI

client = OpenAI(api_key = "Your API Key")

start_date = "20230201"
end_date = "20240101"
DATA_DIR = f"./{start_date}_{end_date}"
os.makedirs(DATA_DIR, exist_ok=True)


In [8]:
def append_to_csv(filename, input_data, output_data):
    
    with open(filename, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([input_data, output_data])

        
def initialize_csv(filename):
    
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["prompt", "answer"])

def query_gpt4(symbol_list, min_past_weeks=1, max_past_weeks=3, with_basics=True):

    for symbol in symbol_list:
        
        csv_file = f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_nobasics_gpt-4.csv'
        
        if not os.path.exists(csv_file):
            initialize_csv(csv_file)
            pre_done = 0
        else:
            df = pd.read_csv(csv_file)
            pre_done = len(df)

        prompts = get_all_prompts(symbol, min_past_weeks, max_past_weeks, with_basics)

        for i, prompt in enumerate(prompts):
            
            if i < pre_done:
                continue

            print(f"{symbol} - {i}")
            
            cnt = 0
            while cnt < 5:
                try:
                    completion = client.chat.completions.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                          ]
                    )
                    print("==Generate answer successfully==")
                    break    
                except Exception:
                    cnt += 1
                    print(f'retry cnt {cnt}')
            
            answer = completion.choices[0].message.content if cnt < 5 else ""
            append_to_csv(csv_file, prompt, answer)
      

### Update for Unchanged Stock Price 
* In case of stock price staying unchanged at the begin and end of a week, we mapped it with a "股价持平" label.
* Don't worry if you generate data using the old version with "up0(上涨0) or down(0)下跌0". We will solve this in the following training data transformation function.

In [28]:
# # get the obtained list(should be update for stock “股价持平”)
# tic_list = [i[:6] for i in os.listdir("20230201_20240101")]
# tic_list

['000301',
 '000069',
 '000157',
 '000002',
 '.DS_St',
 '000063',
 '000166',
 '002594',
 '000333',
 '000001',
 '000100',
 '000338']

In [None]:
query_gpt4(HS_index[:], 1, 3)

# Formulate Training Data ...

In [8]:
with_basics = True
symbol = "000001"
csv_file = f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_gpt-4.csv' if with_basics else \
                f'{DATA_DIR}/{symbol}_{start_date}_{start_date}_nobasics_gpt-4.csv'

df = pd.read_csv(csv_file)

In [9]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def gpt4_to_llama_Chinese(symbol, with_basics=True):
    
    csv_file = f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_gpt-4.csv' if with_basics else \
                   f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_nobasics_gpt-4.csv'
    
    df = pd.read_csv(csv_file)
    
    prompts, answers, periods, labels = [], [], [], []
    
    for i, row in df.iterrows():
        
        prompt, answer = row['prompt'], row['answer']
        
        res = re.search(r"那么让我们假设你对于下一周\((.*)\)的预测是((:?上涨|下跌).*%|股价持平)。", prompt)
        try:
            period, label = res.group(1), res.group(2)
        except AttributeError:
            # set this to check for unchanged price(if the data formulation form has been updated, then this will be skipped)
            res = re.search(r"那么让我们假设你对于下一周\((.*)\)的预测是((:?上涨|下跌).)。", prompt)
            period, label = res.group(1), "股价持平"
        
        prompt = re.sub(
                    r"那么让我们假设你对于下一周\((.*)\)的预测是((:?上涨|下跌).*%|股价持平)。提供一个总结分析来支持你的预测。预测结果需要从你最后的分析中推断出来，因此不作为你分析的基础因素。", 
                    f"接下来请预测{symbol}下周({period})的股票价格。提供一个总结分析来支持你的预测。",
                    prompt
                )
        try:
            answer = re.sub(
                r"\[预测和分析\]：\n",
                f"[预测和分析]：\n预测：{label}\n分析: ",
                answer
            )
        except Exception:
            print(symbol, i)
            print(label)
            print(answer)
            continue
            
        new_system_prompt = SYSTEM_PROMPT.replace('：\n...', '\n预测：...\n分析：...')
        
        prompt = B_INST + B_SYS + new_system_prompt + E_SYS + prompt + E_INST
        
        prompts.append(prompt)
        answers.append(answer)
        periods.append(period)
        labels.append(label)
        
    return {
        "prompt": prompts,
        "answer": answers,
        "period": periods,
        "label": labels,
    }

def create_dataset(symbol_list, train_ratio=0.8, with_basics=True):

    train_dataset_list = []
    test_dataset_list = []

    for symbol in symbol_list:

        data_dict = gpt4_to_llama_Chinese(symbol, with_basics)
#         print(data_dict['prompt'][-1])
#         print(data_dict['answer'][-1])
        symbols = [symbol] * len(data_dict['label'])
        data_dict.update({"symbol": symbols})

        dataset = Dataset.from_dict(data_dict)
        train_size = round(train_ratio * len(dataset))

        train_dataset_list.append(dataset.select(range(train_size)))
        test_dataset_list.append(dataset.select(range(train_size, len(dataset))))

    train_dataset = datasets.concatenate_datasets(train_dataset_list)
    test_dataset = datasets.concatenate_datasets(test_dataset_list)

    dataset = datasets.DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    
    return dataset

In [10]:
gpt4_to_llama_Chinese("000001")

{'prompt': ['[INST]<<SYS>>\n你是一个经验丰富的股票市场分析师。你的任务是根据过去几周的相关新闻和基本财务状况，列出公司的积极发展和潜在担忧，然后对公司未来一周的股价走势提供分析和预测。你的回答格式应该如下：\n\n[积极发展]：\n1. ...\n\n[潜在担忧]：\n1. ...\n\n[预测和分析]\n预测：...\n分析：...\n\n<</SYS>>\n\n[公司介绍]:\n\n平安银行是一家在银行行业的领先实体。自1970年01月01日成立并公开交易以来，该公司已确立其作为市场主要参与者之一的声誉。截止今天，平安银行的总市值为188237406520.59998人民币，总股本数为19405918198.0，流通市值为188233805415.0人民币，流通股数为19405546950.0。\n\n平安银行主要在中国运营，以股票代码000001在交易所进行交易。作为在银行行业的一个主导力量，该公司持续创新，不断推动行业进步。\n\n自2023-02-05至2023-02-12，000001的股票价格由2533.30下跌至2478.04。在此期间的公司新闻如下所列:\n\n[新闻标题]：平安银行成首家接入二代央评系统的全国性股份制银行\n[新闻内容]：三是支持人民银行以合格企业在金融机构的信贷资产为质押品进行再贷款管理。 “接入央评系统是平安银行开展央行普惠再贷业务必要条件，并且可以极大降低平安银行资金成本，助力平安银行进一步加大为中小微企业减费让利力度，反哺实体经济。”平安银行相关负责人认为，一方面，央行通过信贷资产质押再贷款向市场投放货币，有利于提高货币政策操作有效性和灵活性\n\n如下所列为000001近期的一些金融基本面信息，记录时间为2022-12-31:\n\n[金融基本面]:\n\n报告期: 2022-12-31\n净利润: 88.57亿\n净利润同比增长率: 23.00%\n扣非净利润: 88.10亿\n扣非净利润同比增长率: 21.72%\n营业总收入: 416.30亿\n营业总收入同比增长率: -1.33%\n基本每股收益: 0.4200\n每股净资产: 18.80\n每股资本公积金: 4.16\n每股未分配利润: 9.60\n每股经营现金流: 2.16\n销售净利率: 25.30%\n净资产收益率: 2.21%