# Formulate Instruction-Tuning Data

## Generate Task-Response Pairs with GPT-4

In [1]:
from Ashare_data import *
import pandas as pd
import akshare as ak
import pandas as pd
import os
import csv
import re
import time
import math
import json
import random
from datasets import Dataset
import datasets
import math

In [2]:
from openai import OpenAI

# NOTE(review): the client construction is commented out (presumably so that no
# API key is committed). `query_gpt4` calls a global named `client`; unless
# `Ashare_data` exports one, uncomment this line with a real key before running.
# client = OpenAI(api_key = "YOURAPI")

# Date range used to name the data directory and the per-symbol CSV files
# (values appear to be YYYYMMDD strings).
start_date = "20230201"
end_date = "20240101"
# Directory holding the per-symbol prompt/answer CSV files; created if missing.
DATA_DIR = f"./{start_date}_{end_date}_24Apr_qfq"
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
def append_to_csv(filename, input_data, output_data):
    """Append one ``[input_data, output_data]`` row to the CSV at *filename*.

    The file is opened in append mode, so it is created if it does not exist.
    It is written as UTF-8 explicitly: the rows contain non-ASCII text, and an
    explicit encoding keeps the file readable by ``pandas.read_csv`` (UTF-8 by
    default) regardless of the platform's default text encoding.

    Args:
        filename: Path of the CSV file to append to.
        input_data: Value written to the first column.
        output_data: Value written to the second column.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as file:
        csv.writer(file).writerow([input_data, output_data])

        
def initialize_csv(filename):
    """Create (or truncate) *filename* and write the ``prompt,answer`` header row."""
    header = ["prompt", "answer"]
    with open(filename, mode='w', newline='') as handle:
        csv.writer(handle).writerow(header)

def query_gpt4(symbol_list, min_past_weeks=1, max_past_weeks=2, with_basics=True):
    """Query GPT-4 for every prompt of every symbol and log the pairs to CSV.

    For each symbol a CSV file in ``DATA_DIR`` is created, or resumed if it
    already exists: the rows already present are counted and that many leading
    prompts are skipped, so an interrupted run can be restarted.

    Each request is attempted up to five times, with an exponentially growing
    pause between attempts. If every attempt fails, an empty answer is still
    written so that row ``i`` of the CSV keeps corresponding to prompt ``i``,
    which the resume logic depends on.

    Args:
        symbol_list: Iterable of symbols; each is passed to
            ``get_all_prompts_new`` and used in the CSV file name.
        min_past_weeks: Forwarded to ``get_all_prompts_new``.
        max_past_weeks: Forwarded to ``get_all_prompts_new``.
        with_basics: Forwarded to ``get_all_prompts_new``; also selects the
            ``_gpt-4.csv`` or ``_nobasics_gpt-4.csv`` file name.
    """
    max_attempts = 5

    for symbol in symbol_list:

        suffix = 'gpt-4' if with_basics else 'nobasics_gpt-4'
        csv_file = f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_{suffix}.csv'

        if not os.path.exists(csv_file):
            initialize_csv(csv_file)
            pre_done = 0
        else:
            # Number of data rows already written in a previous run.
            pre_done = len(pd.read_csv(csv_file))

        prompts = get_all_prompts_new(symbol, min_past_weeks, max_past_weeks, with_basics)

        for i, prompt in enumerate(prompts):

            if i < pre_done:
                continue

            print(f"{symbol} - {i}")

            # Stays empty unless one of the attempts below succeeds.
            answer = ""
            for attempt in range(1, max_attempts + 1):
                try:
                    completion = client.chat.completions.create(
                        model="gpt-4",
                        messages=[
                            {"role": "system", "content": SYSTEM_PROMPT},
                            {"role": "user", "content": prompt}
                          ]
                    )
                except Exception as err:
                    # Show what went wrong so that persistent failures (bad key,
                    # undefined client, rate limiting) can be diagnosed.
                    print(f'retry cnt {attempt} ({type(err).__name__}: {err})')
                    if attempt < max_attempts:
                        # Back off 1s, 2s, 4s, 8s instead of retrying immediately.
                        time.sleep(2 ** (attempt - 1))
                else:
                    print("==Generate answer successfully==")
                    answer = completion.choices[0].message.content
                    break

            append_to_csv(csv_file, prompt, answer)

In [4]:
# Load the constituents-and-weights spreadsheet for index 000016.SH
# (file name: 成分及权重 = "constituents and weights").
SZ50 = pd.read_excel("000016.SH-成分及权重-20240411.xlsx")
# Keep the first six characters of each entry in the '代码' (code) column;
# presumably this strips an exchange suffix such as ".SH" — TODO confirm.
tickers = [tk[:6] for tk in SZ50['代码'].tolist()]

In [5]:
# First two columns of the header-less HS300 file, named symbol and name.
HS300 = pd.read_csv("HS300Index.csv", header=None).iloc[:, 0:2]
HS300.columns = ['symbol', 'name']
# Left-pad every symbol with zeros to a width of six characters.
HS300['symbol'] = HS300['symbol'].apply(lambda code: str(code).rjust(6, "0"))
HS_index = HS300['symbol'].tolist()

In [6]:
# Print every ticker that does not appear in the HS300 symbol list.
for missing in (tk for tk in tickers if tk not in HS_index):
    print(missing)

688041


In [7]:
# Drop the one ticker reported above as missing from the HS300 list. Guarded so
# that re-running this cell does not raise ValueError once it is already gone.
if "688041" in tickers:
    tickers.remove("688041")

In [None]:
# Generate GPT-4 answers for every remaining ticker (a shallow copy of the list
# is passed). Per-symbol CSVs that already exist are resumed, not regenerated.
query_gpt4(tickers[:])

In [6]:
# Directory that holds the per-symbol GPT-4 prompt/answer CSV files.
DATA_DIR = '20230201_20240101_24Apr_qfq'

# Markers wrapped around the instruction and the system prompt when the
# chat-formatted training prompt is assembled.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

def gpt4_to_llama_Chinese_new(symbol, with_basics=True):
    """Convert one symbol's GPT-4 prompt/answer CSV into chat-formatted samples.

    Each stored prompt embeds the target label in a sentence of the form
    ``那么让我们假设你对于下一周(<period>)的预测是<label>。``. For every CSV row this
    function:

    * extracts ``period`` and ``label`` from that sentence;
    * replaces the sentence, together with the instruction that follows it,
      by a request to predict the move, so the label no longer appears in
      the prompt;
    * inserts the label into the answer right after the ``[预测和分析]：`` header;
    * wraps the prompt and the adapted system prompt in the
      ``[INST]`` / ``<<SYS>>`` markers.

    Rows are reported on stdout and skipped when the prompt or answer is not a
    string (e.g. an empty answer read back as a missing value) or when no label
    sentence can be found in the prompt.

    Args:
        symbol: Symbol whose CSV is read; also inserted into the new prompt.
        with_basics: Selects the ``_gpt-4.csv`` or ``_nobasics_gpt-4.csv`` file.

    Returns:
        dict with four equally long lists under the keys ``"prompt"``,
        ``"answer"``, ``"period"`` and ``"label"``.
    """
    suffix = 'gpt-4' if with_basics else 'nobasics_gpt-4'
    csv_file = f'{DATA_DIR}/{symbol}_{start_date}_{end_date}_{suffix}.csv'

    df = pd.read_csv(csv_file)

    # Pieces of the sentence that leaks the label into the stored prompt.
    lead_in = r"那么让我们假设你对于下一周\((.*)\)的预测是"
    follow_up = "提供一个总结分析来支持你的预测。预测结果需要从你最后的分析中推断出来，因此不作为你分析的基础因素。"
    # Current format: "上涨…%" / "下跌…%" / "股价持平".
    current_label = r"((?:上涨|下跌).*%|股价持平)。"
    # Fallback format kept from the original code: 上涨/下跌 followed by a single
    # character. Per the original author's note it stands for an unchanged
    # price and is not expected once the data format has been updated.
    legacy_label = r"((?:上涨|下跌).)。"

    current_search = re.compile(lead_in + current_label)
    legacy_search = re.compile(lead_in + legacy_label)
    current_rewrite = re.compile(lead_in + current_label + follow_up)
    legacy_rewrite = re.compile(lead_in + legacy_label + follow_up)
    answer_header = re.compile(r"\[预测和分析\]：\n")

    # Loop-invariant: system prompt adapted to the two-part answer layout.
    new_system_prompt = SYSTEM_PROMPT.replace('：\n...', '：\n预测涨跌幅：...\n总结分析：...')

    prompts, answers, periods, labels = [], [], [], []

    for i, (prompt, answer) in enumerate(zip(df['prompt'], df['answer'])):

        if not isinstance(prompt, str):
            print(symbol, i)
            print(prompt)
            continue

        match = current_search.search(prompt)
        if match is not None:
            period, label = match.group(1), match.group(2)
            rewrite = current_rewrite
        else:
            match = legacy_search.search(prompt)
            if match is None:
                print(symbol, i)
                print("no prediction sentence found in prompt; row skipped")
                continue
            period, label = match.group(1), "股价持平"
            # Rewrite with the pattern that actually matched, so the label
            # sentence is removed from fallback-format prompts as well.
            rewrite = legacy_rewrite

        instruction = f"接下来请预测{symbol}下周({period})的股票涨跌幅，并提供一个总结分析来支持你的预测。"
        # A callable replacement inserts the text literally (no backslash or
        # group-reference processing of symbol/period).
        prompt = rewrite.sub(lambda _m: instruction, prompt)

        if not isinstance(answer, str):
            # Non-string answers cannot be processed; report and skip the row.
            print(symbol, i)
            print(label)
            print(answer)
            continue

        labelled_header = f"[预测和分析]：\n预测涨跌幅：{label}\n总结分析："
        answer = answer_header.sub(lambda _m: labelled_header, answer)

        prompt = B_INST + B_SYS + new_system_prompt + E_SYS + prompt + E_INST

        prompts.append(prompt)
        answers.append(answer)
        periods.append(period)
        labels.append(label)

    return {
        "prompt": prompts,
        "answer": answers,
        "period": periods,
        "label": labels,
    }

def create_dataset_new(symbol_list, train_ratio=0.8, with_basics=True):
    """Build a train/test ``DatasetDict`` from the per-symbol converted samples.

    Every symbol is converted with ``gpt4_to_llama_Chinese_new``, tagged with a
    ``symbol`` column and split in row order: the first
    ``round(train_ratio * n)`` rows go to train, the rest to test. The
    per-symbol parts are then concatenated in ``symbol_list`` order.

    Args:
        symbol_list: Symbols to include.
        train_ratio: Fraction of each symbol's rows placed in the train split.
        with_basics: Forwarded to ``gpt4_to_llama_Chinese_new``.

    Returns:
        ``datasets.DatasetDict`` with ``'train'`` and ``'test'`` splits.
    """
    train_parts, test_parts = [], []

    for symbol in symbol_list:
        records = gpt4_to_llama_Chinese_new(symbol, with_basics)
        records["symbol"] = [symbol] * len(records['label'])

        per_symbol = Dataset.from_dict(records)
        total = len(per_symbol)
        split_at = round(train_ratio * total)

        train_parts.append(per_symbol.select(range(split_at)))
        test_parts.append(per_symbol.select(range(split_at, total)))

    return datasets.DatasetDict({
        'train': datasets.concatenate_datasets(train_parts),
        'test': datasets.concatenate_datasets(test_parts),
    })

In [16]:
# Build the train/test DatasetDict from the first 30 tickers, using the
# function's defaults (train_ratio=0.8, with_basics=True).
dataset = create_dataset_new(tickers[:30])

In [17]:
# Components of the output directory name. These values only label the saved
# path; they are not passed to the calls that generated or split the data.
index_name = "SZ50"
min_past_weeks = 1
max_past_weeks = 2
train_ratio = 0.8
dataset.save_to_disk(
    "./SZdata0413/" + "-".join([
        "fingpt",
        "forecaster",
        index_name.lower(),
        start_date.replace('-', ''),
        end_date.replace('-', ''),
        str(min_past_weeks),
        str(max_past_weeks),
        str(train_ratio).replace('.', ''),
    ])
)

Saving the dataset (0/1 shards):   0%|          | 0/1110 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/270 [00:00<?, ? examples/s]