In [20]:
import json
import re
import os
import random
import datasets
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig

# Sentiment Datasets (following FinGPT v3)

### 1. FPB

In [2]:
dic = {
    0:"negative",
    1:'neutral',
    2:'positive',
}

In [3]:
# fpb_datasets = load_dataset("financial_phrasebank", "sentences_50agree")
fpb_datasets = load_from_disk('../data/financial_phrasebank-sentences_50agree/')
fpb_datasets = fpb_datasets["train"]
fpb_datasets = fpb_datasets.to_pandas()
fpb_datasets.columns = ["input", "output"]
fpb_datasets["output"] = fpb_datasets["output"].apply(lambda x:dic[x])
fpb_datasets["instruction"] = "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."
fpb_datasets = datasets.Dataset.from_pandas(fpb_datasets)
fpb_datasets = fpb_datasets.train_test_split(seed=42)['train']
fpb_datasets

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 3634
})

In [4]:
train_dataset = datasets.concatenate_datasets([fpb_datasets]*6)   # we want each data source have similar number of samples
train_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 21804
})

### 2. FiQA SA

In [5]:
def make_label(x):
    if x < - 0.1:
        return "negative"
    elif -0.1 <= x < 0.1:
        return "neutral"
    else:
        return "positive"

def add_instructions(x):
    if x == "post":
        return "What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}."
    else:
        return "What is the sentiment of this news? Please choose an answer from {negative/neutral/positive}."

In [6]:
# dataset = load_dataset('pauri32/fiqa-2018')
dataset = load_from_disk('../data/fiqa-2018/')
dataset = datasets.concatenate_datasets([dataset["train"], dataset["validation"] ,dataset["test"] ])
dataset = dataset.to_pandas()
dataset["output"] = dataset.sentiment_score.apply(make_label)
dataset["instruction"] = dataset.format.apply(add_instructions)
dataset = dataset[['sentence', 'output', "instruction"]]
dataset.columns = ["input", "output", "instruction"]
dataset = datasets.Dataset.from_pandas(dataset)
dataset = dataset.train_test_split(0.226, seed=42)['train']
dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 938
})

In [7]:
tmp_dataset = datasets.concatenate_datasets([dataset]*21)
train_dataset = datasets.concatenate_datasets([train_dataset, tmp_dataset]) 
print(tmp_dataset.num_rows)
train_dataset

19698


Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 41502
})

### 3. TFNS

In [8]:
dic = {
    0:"negative",
    1:'positive',
    2:'neutral',
}

In [9]:
# social_media_dataset = load_dataset('zeroshot/twitter-financial-news-sentiment')
social_media_dataset = load_from_disk('../data/twitter-financial-news-sentiment')
social_media_dataset = social_media_dataset['train']
social_media_dataset = social_media_dataset.to_pandas()
social_media_dataset['label'] = social_media_dataset['label'].apply(lambda x:dic[x])
social_media_dataset['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
social_media_dataset.columns = ['input', 'output', 'instruction']
social_media_dataset = datasets.Dataset.from_pandas(social_media_dataset)
social_media_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 9543
})

In [10]:
tmp_dataset = datasets.concatenate_datasets([social_media_dataset]*2)
train_dataset = datasets.concatenate_datasets([train_dataset,tmp_dataset]) 
print(tmp_dataset.num_rows)
train_dataset

19086


Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 60588
})

### 4. NWGI

In [11]:
# finance_dataset = load_dataset('oliverwang15/news_with_gpt_instructions')
finance_dataset = load_from_disk('../data/news_with_gpt_instructions/')
finance_dataset = finance_dataset['train'].to_pandas()
finance_dataset['output'] = finance_dataset['label']
finance_dataset["input"] = finance_dataset["news"]
finance_dataset["instruction"] = 'What is the sentiment of this news? Please choose an answer from {strong negative/moderately negative/mildly negative/neutral/mildly positive/moderately positive/strong positive}.'
finance_dataset = finance_dataset[['input', 'output', 'instruction']]
finance_dataset = datasets.Dataset.from_pandas(finance_dataset)
finance_dataset

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 16184
})

In [12]:
train_dataset = datasets.concatenate_datasets([train_dataset, finance_dataset])
all_dataset = train_dataset.shuffle(seed=42)
all_dataset.shape

(76772, 3)

In [14]:
# from huggingface_hub import notebook_login
# notebook_login()

In [15]:
# all_dataset.push_to_hub("fingpt_chatglm2_sentiment_instruction_lora_ft_dataset")

In [13]:
21804 + 19698 + 19086 + 16184

76772

In [14]:
all_dataset.save_to_disk('fingpt-sentiment-train')

Saving the dataset (0/1 shards):   0%|          | 0/76772 [00:00<?, ? examples/s]

## CLS Ver. For Zero-shot Assessment

In [17]:
chatgpt_templates = [
    "What is the sentiment of the input {type} from financial perspective?",
    "Assign a sentiment category to this {type} related to finance.",
    "Categorize the input {type}'s emotional tone into one of three groups.",
    "Determine the sentiment expressed in the {type} from financial perspective.",
    "Characterize the {type}'s sentiment using the following options.",
]

with open('../benchmarks/sentiment_templates.txt', 'w') as f:
    f.writelines([l + '\n' for l in chatgpt_templates])


def option_list(ops_str):
    options = ops_str.split('/')
    random.shuffle(options)
    return ", ".join(options)


def map_func(feature):
    t = re.search(r'tweet|news', feature['instruction']).group()
    ops = option_list("negative/positive")
    if 'positive' in feature['output']:
        output = 'positive'
    elif 'negative' in feature['output']:
        output = 'negative' 
    else:
        output = 'neutral'
    return {"instruction": random.choice(chatgpt_templates).format(type=t) + "\nOptions: " + ops, "output": output}


In [18]:
random.seed(0)
all_dataset_instruct = all_dataset.map(map_func)
all_dataset_instruct = all_dataset_instruct.filter(lambda x: x['output'] != 'neutral')
all_dataset_instruct

Map:   0%|          | 0/76772 [00:00<?, ? examples/s]

Filter:   0%|          | 0/76772 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 47557
})

In [19]:
all_dataset_instruct.save_to_disk('fingpt-sentiment-cls-instruct')
all_dataset_instruct['instruction'][:10]

Saving the dataset (0/1 shards):   0%|          | 0/47557 [00:00<?, ? examples/s]

['Determine the sentiment expressed in the news from financial perspective.\nOptions: negative, positive',
 'Determine the sentiment expressed in the tweet from financial perspective.\nOptions: negative, positive',
 "Characterize the news's sentiment using the following options.\nOptions: negative, positive",
 "Characterize the news's sentiment using the following options.\nOptions: negative, positive",
 'What is the sentiment of the input tweet from financial perspective?\nOptions: positive, negative',
 'Determine the sentiment expressed in the tweet from financial perspective.\nOptions: negative, positive',
 "Categorize the input tweet's emotional tone into one of three groups.\nOptions: negative, positive",
 "Characterize the news's sentiment using the following options.\nOptions: positive, negative",
 'Determine the sentiment expressed in the news from financial perspective.\nOptions: negative, positive',
 'What is the sentiment of the input tweet from financial perspective?\nOptio

# Headline

In [21]:
df = pd.read_csv('gold-dataset-sinha-khandait.csv')
df.head()

Unnamed: 0,Dates,URL,News,Price or Not,Price Direction Up,Price Direction Constant,Price Direction Down,PastPrice,FuturePrice,PastNews,FutureNews,Asset Comparision,Price Sentiment
0,28-01-2016,http://www.marketwatch.com/story/april-gold-do...,"april gold down 20 cents to settle at $1,116.1...",1,0,0,1,1,0,0,0,0,negative
1,13-09-2017,http://www.marketwatch.com/story/gold-prices-s...,gold suffers third straight daily decline,1,0,0,1,1,0,0,0,0,negative
2,26-07-2016,http://www.marketwatch.com/story/gold-futures-...,Gold futures edge up after two-session decline,1,1,0,0,1,0,0,0,0,positive
3,28-02-2018,https://www.metalsdaily.com/link/277199/dent-r...,dent research : is gold's day in the sun comin...,0,0,0,0,0,0,0,1,0,none
4,06-09-2017,http://www.marketwatch.com/story/gold-steadies...,"Gold snaps three-day rally as Trump, lawmakers...",1,1,0,0,1,0,0,0,0,positive


In [22]:
train_dataset, test_dataset = {}, {}
inputs, outputs, instructions = [], [], []
for index, row in df.iterrows():
    
    if index + 1 >= len(df) * 0.8 and not train_dataset:
        train_dataset['input'] = inputs
        train_dataset['output'] = outputs
        train_dataset['instruction'] = instructions
        inputs, outputs, instructions = [], [], []
        
    inputs.extend([row['News']] * 9)
    # price or not
    instructions.append('Does the news headline talk about price? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['Price Direction Constant'] else 'No')
    # price up
    instructions.append('Does the news headline talk about price going up? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['Price Direction Up'] else 'No')
    # price stable
    instructions.append('Does the news headline talk about price staying constant? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['Price Direction Constant'] else 'No')
    # price down
    instructions.append('Does the news headline talk about price going down? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['Price Direction Down'] else 'No')
    # past price
    instructions.append('Does the news headline talk about price in the past? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['PastPrice'] else 'No')
    # future price
    instructions.append('Does the news headline talk about price in the future? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['FuturePrice'] else 'No')
    # past general
    instructions.append('Does the news headline talk about a general event (apart from prices) in the past? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['PastNews'] else 'No')
    # future general
    instructions.append('Does the news headline talk about a general event (apart from prices) in the future? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['FutureNews'] else 'No')
    # asset comparison
    instructions.append('Does the news headline compare gold with any other asset? Please choose an answer from {Yes/No}.')
    outputs.append('Yes' if row['Asset Comparision'] else 'No')
    
test_dataset['input'] = inputs
test_dataset['output'] = outputs
test_dataset['instruction'] = instructions

print(len(train_dataset['input']) // 9)
print(len(test_dataset['input']) // 9)


9129
2283


In [23]:
headline_dataset = DatasetDict({
    'train': Dataset.from_dict(train_dataset),
    'test': Dataset.from_dict(test_dataset)
})
headline_dataset.save_to_disk('fingpt-headline')
headline_dataset

Saving the dataset (0/1 shards):   0%|          | 0/82161 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20547 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 82161
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 20547
    })
})

## CLS Ver. for Zero-shot Training

In [24]:
chatgpt_templates = [
    "Does the news headline talk about {subject}? ",
    "Please determine if the news headline addresses {subject}.",
    "In the context of the news headline, is {subject} discussed?",
    "Is the news headline related to {subject}?",
    "Let me know if the news headline talks about {subject}.",
    "Consider the news headline - does it concern {subject}?",
    "Examine the news headline and decide if it includes {subject}.",
    "Assess if the news headline touches on {subject}.",
    "Review the news headline and determine if it relates to {subject}.",
    "Analyze the news headline for any mention of {subject}.",
    "Interpret the news headline to see if it mentions {subject}."
]

def yes_no():
    options = ['Yes', 'No']
    random.shuffle(options)
    return ", ".join(options)

def map_func(feature):
    match_res = re.search(r'talk about (.*)\? Please', feature['instruction'])
    subject = 'comparing gold with any other asset' if match_res is None else match_res.group(1)
    return {"instruction": random.choice(chatgpt_templates).format(subject=subject) + "\nOptions: " + yes_no()}
    

In [25]:
random.seed(0)
headline_dataset_instruct = headline_dataset.map(map_func)
headline_dataset_instruct.save_to_disk('fingpt-headline-cls-instruct')

Map:   0%|          | 0/82161 [00:00<?, ? examples/s]

Map:   0%|          | 0/20547 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82161 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20547 [00:00<?, ? examples/s]

In [26]:
headline_dataset_instruct['train']['instruction'][:10]

['Examine the news headline and decide if it includes price.\nOptions: Yes, No',
 'Does the news headline talk about price going up? \nOptions: Yes, No',
 'Review the news headline and determine if it relates to price staying constant.\nOptions: Yes, No',
 'Examine the news headline and decide if it includes price going down.\nOptions: Yes, No',
 'Assess if the news headline touches on price in the past.\nOptions: Yes, No',
 'Analyze the news headline for any mention of price in the future.\nOptions: No, Yes',
 'Review the news headline and determine if it relates to a general event (apart from prices) in the past.\nOptions: No, Yes',
 'Let me know if the news headline talks about a general event (apart from prices) in the future.\nOptions: No, Yes',
 'Please determine if the news headline addresses comparing gold with any other asset.\nOptions: Yes, No',
 'Review the news headline and determine if it relates to price.\nOptions: No, Yes']

# NER

In [28]:
# Read and parse the CoNLL-2003 formatted dataset

ent_dict = {
    'PER': 'person',
    'ORG': 'organization',
    'LOC': 'location',
}

def read_conll_file(file_path):
    sentences, tokens, labels = [], [], []
    with open(file_path, 'r') as f:
        sentence = []
        for line in f:
            if line.strip() == '':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                parts = line.strip().split()
                token, label = parts[0], parts[-1]
                tokens.append(token)
                labels.append(label)
                sentence.append((token, label))
                
    return sentences
                

def get_ner_dataset(sentences):
    
    inputs, outputs, instructions = [], [], []
    count = {'person': 0, 'organization': 0, 'location': 0}
    for sentence in sentences:
        is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
        if sum(is_entity) == 0:
            continue
        instructions.append('Please extract entities and their types from the input sentence, entity types should be chosen from {person/organization/location}.')
        inputs.append(' '.join([tup[0] for tup in sentence]))
        outputs.append('')
        tmp_tup_list = []
        for i, tup in enumerate(sentence):
            if tmp_tup_list and (not is_entity[i] or tmp_tup_list[-1][1] != tup[1] or i + 1 == len(sentence)):
                entity = ' '.join([t[0] for t in tmp_tup_list])
                assert tmp_tup_list[0][1] == tmp_tup_list[-1][1], tmp_tup_list
                entity_type = ent_dict[tmp_tup_list[-1][1].split('-')[-1]]
                a = 'an' if entity_type == 'organization' else 'a'
                outputs[-1] += f'{entity} is {a} {entity_type}, ' 
                tmp_tup_list = [] if not is_entity[i] else [tup]
                count[entity_type] += 1
            elif is_entity[i]:
                tmp_tup_list.append(tup)
            else:
                pass
        outputs[-1] = outputs[-1].strip(', ') + '.'
    
    print(len(instructions))
    print(count)
        
    return {"input": inputs, "output": outputs, "instruction": instructions}

In [29]:
train_data = read_conll_file('./SEC-filings/CONLL-format/data/train/FIN5.txt')
test_data = read_conll_file('./SEC-filings/CONLL-format/data/test/FIN3.txt')

train_data = get_ner_dataset(train_data)
test_data = get_ner_dataset(test_data)

ner_dataset = DatasetDict({
    'train': Dataset.from_dict(train_data),
    'test': Dataset.from_dict(test_data)
})
ner_dataset.save_to_disk('fingpt-ner')
ner_dataset

511
{'person': 745, 'organization': 243, 'location': 168}
98
{'person': 216, 'organization': 56, 'location': 39}


Saving the dataset (0/1 shards):   0%|          | 0/511 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/98 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 511
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 98
    })
})

In [30]:
# def get_ner_dataset_1(sentences):
    
#     inputs, outputs, instructions = [], [], []
#     count = {'person': 0, 'organization': 0, 'location': 0}
#     for sentence in sentences:
#         is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
#         if sum(is_entity) == 0:
#             continue
#         instructions.append('Please list all entities in the input text that fit the following entity types: "person", "organization", "location". Output format is "type1: entity1; type2: entity2"')
#         instructions.append('Given options of entity types, please find all the entities associated with them in the input text. answer with format "entity1: type1; entity2: type2".\nOptions: "person", "organization", "location".')
#         inputs.append(' '.join([tup[0] for tup in sentence]))
#         inputs.append(' '.join([tup[0] for tup in sentence]))
#         outputs.append('')
#         outputs.append('')
#         tmp_tup_list = []
#         for i, tup in enumerate(sentence):
#             if tmp_tup_list and (not is_entity[i] or tmp_tup_list[-1][1] != tup[1] or i + 1 == len(sentence)):
#                 entity = ' '.join([t[0] for t in tmp_tup_list])
#                 assert tmp_tup_list[0][1] == tmp_tup_list[-1][1], tmp_tup_list
#                 entity_type = ent_dict[tmp_tup_list[-1][1].split('-')[-1]]
#                 outputs[-2] += f'{entity_type}: {entity}; ' 
#                 outputs[-1] += f'{entity}: {entity_type}; ' 
#                 tmp_tup_list = [] if not is_entity[i] else [tup]
#                 count[entity_type] += 1
#             elif is_entity[i]:
#                 tmp_tup_list.append(tup)
#             else:
#                 pass
#         outputs[-1] = outputs[-1].strip('; ')
#         outputs[-2] = outputs[-2].strip('; ')
    
#     print(len(instructions))
#     print(count)
        
#     return {"input": inputs, "output": outputs, "instruction": instructions}


# def get_ner_dataset_2(sentences):
    
#     inputs, outputs, instructions = [], [], []
#     count = {'person': 0, 'organization': 0, 'location': 0}
#     templates = [
#         'Does the input text include any entity of type "{entity_type}" ? If so, list them all, and answer with format "entity1; entity2". If not, Answer "No".',
#         'Please tell me all the entities in the text that belong to the given category "{entity_type}". Output format is "entity1; entity2". If no such entity can be found, answer "No".',
#     ]
#     for sentence in sentences:
#         is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
#         if sum(is_entity) == 0:
#             continue
#         for template in templates:
#             for tgt_entity_type in ['person', 'location', 'organization']:
#                 instructions.append(template.format(entity_type=tgt_entity_type))
#                 inputs.append(' '.join([tup[0] for tup in sentence]))
#                 outputs.append('')
#                 tmp_tup_list = []
#                 for i, tup in enumerate(sentence):
#                     if tmp_tup_list and (not is_entity[i] or tmp_tup_list[-1][1] != tup[1] or i + 1 == len(sentence)):
#                         entity = ' '.join([t[0] for t in tmp_tup_list])
#                         assert tmp_tup_list[0][1] == tmp_tup_list[-1][1], tmp_tup_list
#                         entity_type = ent_dict[tmp_tup_list[-1][1].split('-')[-1]]
#                         tmp_tup_list = [] if not is_entity[i] else [tup]
#                         if entity_type == tgt_entity_type:
#                             outputs[-1] += f'{entity}; ' 
#                             count[entity_type] += 1
#                     elif is_entity[i]:
#                         tmp_tup_list.append(tup)
#                     else:
#                         pass
#                 outputs[-1] = 'No' if not outputs[-1] else outputs[-1].strip('; ')
    
#     print(len(instructions))
#     print(count)
        
#     return {"input": inputs, "output": outputs, "instruction": instructions}


# def get_ner_dataset_3(sentences):
    
#     inputs, outputs, instructions = [], [], []
#     count = {'person': 0, 'organization': 0, 'location': 0}
#     for sentence in sentences:
#         is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
#         if sum(is_entity) == 0:
#             continue
#         prev_count = count.copy()
#         instructions.append('Please find all entities in the input text that fit the following entity types: "person", "organization", "location", then answer with the number counted for each type. Output format is "type1: number1; type2: number2"')
#         instructions.append('Given entity types as options, please find the number of occurrence for each entity type in the input text. answer with format "type1: number1; type2: number2".\nOptions: "person", "organization", "location".')
#         inputs.extend([' '.join([tup[0] for tup in sentence])] * 2)
#         tmp_tup_list = []
#         for i, tup in enumerate(sentence):
#             if tmp_tup_list and (not is_entity[i] or tmp_tup_list[-1][1] != tup[1] or i + 1 == len(sentence)):
#                 entity = ' '.join([t[0] for t in tmp_tup_list])
#                 assert tmp_tup_list[0][1] == tmp_tup_list[-1][1], tmp_tup_list
#                 entity_type = ent_dict[tmp_tup_list[-1][1].split('-')[-1]]
#                 tmp_tup_list = [] if not is_entity[i] else [tup]
#                 count[entity_type] += 1
#             elif is_entity[i]:
#                 tmp_tup_list.append(tup)
#             else:
#                 pass
        
#         per_cnt = count['person'] - prev_count['person']
#         loc_cnt = count['location'] - prev_count['location']
#         org_cnt = count['organization'] - prev_count['organization']
#         output_str = f'person: {per_cnt}; location: {loc_cnt}; organization: {org_cnt}'
#         outputs.extend([output_str] * 2)
    
#     print(len(instructions))
#     print(count)
        
#     return {"input": inputs, "output": outputs, "instruction": instructions}


In [31]:
# train_data = read_conll_file('./SEC-filings/CONLL-format/data/train/FIN5.txt')
# test_data = read_conll_file('./SEC-filings/CONLL-format/data/test/FIN3.txt')

# train_data_0 = get_ner_dataset(train_data)
# train_data_1 = get_ner_dataset_1(train_data)
# train_data_2 = get_ner_dataset_2(train_data)
# train_data_3 = get_ner_dataset_3(train_data)

# test_data_0 = get_ner_dataset(test_data)
# test_data_1 = get_ner_dataset_1(test_data)
# test_data_2 = get_ner_dataset_2(test_data)
# test_data_3 = get_ner_dataset_3(test_data)

# train_data = {k: train_data_0[k] + train_data_1[k] + train_data_2[k] + train_data_3[k] for k in train_data_0.keys()} 
# test_data = {k: test_data_0[k] + test_data_1[k] + test_data_2[k] + test_data_3[k] for k in test_data_0.keys()} 
# ner_dataset = DatasetDict({
#     'train': Dataset.from_dict(train_data),
#     'test': Dataset.from_dict(test_data)
# })
# ner_dataset.save_to_disk('fingpt-ner-full')
# ner_dataset

In [34]:
# chatgpt_templates = {
#     'Please list all entities in the input text that fit the following entity types: "person", "organization", "location". Output format is "type1: entity1; type2: entity2"':[
#         "Identify and compile all entities of types 'person,' 'organization,' and 'location' from the input text. Format the output as 'type1: entity1; type2: entity2.'",
#         "Gather all entities falling under the categories 'person,' 'organization,' and 'location' within the input text. Please use the 'type1: entity1; type2: entity2' format for the output.",
#         "Extract entities categorized as 'person,' 'organization,' or 'location' from the input text. Organize the results in the format 'type1: entity1; type2: entity2.'",
#         "Compile a list of entities matching 'person,' 'organization,' and 'location' entity types from the input text. Ensure the output format remains 'type1: entity1; type2: entity2.'",
#         "Detect and list all entities that fall under the types 'person,' 'organization,' and 'location' in the input text. Format the output as 'type1: entity1; type2: entity2.'",
#         "Enumerate entities of types 'person,' 'organization,' and 'location' from the input text. Use the output format 'type1: entity1; type2: entity2.'",
#         "Identify and present entities categorized as 'person,' 'organization,' and 'location' from the input text. Keep the output in the format 'type1: entity1; type2: entity2.'",
#         "Spot and document entities falling into the 'person,' 'organization,' and 'location' types within the input text. Keep the 'type1: entity1; type2: entity2' output format.",
#         "Extract entities matching 'person,' 'organization,' and 'location' types from the input text. Ensure the output follows the 'type1: entity1; type2: entity2' format.",
#         "Enumerate all entities that correspond to 'person,' 'organization,' and 'location' entity types within the input text. Format the output as 'type1: entity1; type2: entity2.'"
#     ],
#     'Given options of entity types, please find all the entities associated with them in the input text. answer with format "entity1: type1; entity2: type2".\nOptions: "person", "organization", "location".': [
#         "Using the provided options of entity types, locate all corresponding entities within the input text. Present the results in the format 'entity1: type1; entity2: type2.'",
#         "Find entities related to the given options of entity types within the input text. Respond with the format 'entity1: type1; entity2: type2.'",
#         "Identify entities associated with the provided entity type options in the input text. Format your response as 'entity1: type1; entity2: type2.'",
#         "Discover entities linked to the entity types listed in the options within the input text. Provide the output in the format 'entity1: type1; entity2: type2.'",
#         "Locate all entities corresponding to the provided entity type options in the input text. Present your findings as 'entity1: type1; entity2: type2.'",
#         "Search for entities related to the entity types listed in the options within the input text. Share the results in the 'entity1: type1; entity2: type2' format.",
#         "Identify entities associated with the given entity type options in the input text. Use the format 'entity1: type1; entity2: type2' for your response.",
#         "Retrieve entities connected to the entity types mentioned in the options from the input text. Format your answer as 'entity1: type1; entity2: type2.'",
#         "Examine the input text for entities that match the provided entity type options. Respond using the 'entity1: type1; entity2: type2' format.",
#         "Scour the input text for entities corresponding to the entity types in the options. Present the results in the 'entity1: type1; entity2: type2' format.",
#     ],
#     'Does the input text include any entity of type "{entity_type}" ? If so, list them all, and answer with format "entity1; entity2". If not, Answer "No".': [
#         "Check if the input text contains any entities of type '{entity_type}'. If found, list them all in the format 'entity1; entity2.' Otherwise, respond with 'No.'",
#         "Examine the input text for the presence of entities belonging to the '{entity_type}' type. If any are found, provide a list in the format 'entity1; entity2.' If none are found, answer 'No.'",
#         "Verify whether the input text contains entities of the type '{entity_type}.' If so, enumerate them in the format 'entity1; entity2.' If not, reply with 'No.'",
#         "Determine if the input text includes any entities categorized as '{entity_type}.' If it does, present them in the format 'entity1; entity2.' In case of none, respond with 'No.'",
#         "Assess whether the input text encompasses entities of the '{entity_type}' type. If there are any, list them as 'entity1; entity2.' If there are none, respond with 'No.'",
#         "Inspect the input text to find out if it contains any entities labeled as '{entity_type}.' If it does, provide a list in the format 'entity1; entity2.' If not, reply with 'No.'",
#         "Ascertain whether the input text possesses entities falling under the '{entity_type}' category. If it does, compile them in the format 'entity1; entity2.' If it doesn't, answer 'No.'",
#         "Confirm whether the input text harbors entities of type '{entity_type}.' If it does, list them all as 'entity1; entity2.' If it doesn't, respond with 'No.'",
#         "Investigate whether the input text features any entities of the '{entity_type}' variety. If any are present, document them in the format 'entity1; entity2.' Otherwise, answer 'No.'",
#         "Scrutinize the input text to determine the presence of entities belonging to the '{entity_type}' category. If any are found, report them as 'entity1; entity2.' If none are found, respond with 'No.'"
#     ],
#     'Please tell me all the entities in the text that belong to the given category "{entity_type}". Output format is "entity1; entity2". If no such entity can be found, answer "No".': [
#         "Identify and provide all entities within the text that fall under the specified category '{entity_type}'. Format the output as 'entity1; entity2.' If no such entity exists, respond with 'No.'",
#         "Locate and list all entities in the text that are categorized as '{entity_type}'. Use the output format 'entity1; entity2.' If there are no entities of this type, answer 'No.'",
#         "Find and present all entities within the text that belong to the designated category '{entity_type}'. Ensure the output maintains the format 'entity1; entity2.' If there are none, reply with 'No.'",
#         "Discover and enumerate all entities in the text that are classified as '{entity_type}'. Format the output as 'entity1; entity2.' In the absence of such entities, respond with 'No.'",
#         "Search for and compile all entities in the text that pertain to the specified category '{entity_type}'. Use the 'entity1; entity2' format for the output. If none are found, answer 'No.'",
#         "Identify and gather all entities within the text that fit the given category '{entity_type}'. Present the results in the format 'entity1; entity2.' If there are no entities of this type, reply with 'No.'",
#         "Detect and list all entities in the text that are associated with the provided category '{entity_type}'. Maintain the output format as 'entity1; entity2.' If none are found, respond with 'No.'",
#         "Spot and document all entities within the text that are aligned with the specified category '{entity_type}'. Use the format 'entity1; entity2.' In the event of no such entities, answer 'No.'",
#         "Examine and compile all entities in the text that fall under the given category '{entity_type}'. Keep the output in the 'entity1; entity2' format. If there are no entities of this type, reply with 'No.'",
#         "Check and provide all entities in the text that match the category '{entity_type}'. Format the output as 'entity1; entity2.' If there are no entities of this type, answer 'No.'",
#     ],
#     'Please find all entities in the input text that fit the following entity types: "person", "organization", "location", then answer with the number counted for each type. Output format is "type1: number1; type2: number2"': [
#         "Identify all entities within the input text falling under the specified entity types: 'person,' 'organization,' and 'location.' Report the counts for each type in the format 'type1: number1; type2: number2.'",
#         "Detect entities of the 'person,' 'organization,' and 'location' types in the input text. Provide the respective counts for each type as 'type1: number1; type2: number2.'",
#         "Locate entities categorized as 'person,' 'organization,' and 'location' within the input text. Deliver the counts for each type in the 'type1: number1; type2: number2' format.",
#         "Search for entities that match the entity types 'person,' 'organization,' and 'location' in the input text. Present the counts for each type in the format 'type1: number1; type2: number2.'",
#         "Gather entities belonging to the specified types 'person,' 'organization,' and 'location' from the input text. Respond with the counts for each type in the 'type1: number1; type2: number2' format.",
#         "Examine the input text for entities of the 'person,' 'organization,' and 'location' categories. Report the respective counts for each type as 'type1: number1; type2: number2.'",
#         "Check for entities falling under 'person,' 'organization,' and 'location' types in the input text. Provide the counts for each type in the format 'type1: number1; type2: number2.'",
#         "Assess the input text for entities of types 'person,' 'organization,' and 'location.' Share the counts for each type in the 'type1: number1; type2: number2' format.",
#         "Inspect the input text to find entities categorized as 'person,' 'organization,' and 'location.' Communicate the counts for each type using the format 'type1: number1; type2: number2.'",
#         "Identify entities matching 'person,' 'organization,' and 'location' types within the input text. Provide the counts for each type in the 'type1: number1; type2: number2' format."
#     ],
#     'Given entity types as options, please find the number of occurrence for each entity type in the input text. answer with format "type1: number1; type2: number2".\nOptions: "person", "organization", "location".': [
#         "Using the provided entity type options, determine the count of each entity type within the input text. Respond in the format 'type1: number1; type2: number2.'",
#         "Calculate the occurrences of each entity type listed in the options within the input text. Format the answer as 'type1: number1; type2: number2.'",
#         "Count the instances of each entity type mentioned in the options within the input text. Provide the counts in the format 'type1: number1; type2: number2.'",
#         "Find and tally the number of occurrences for each entity type from the given options in the input text. Answer using the format 'type1: number1; type2: number2.'",
#         "Examine the input text and enumerate the occurrences of each entity type specified in the options. Report the counts in the format 'type1: number1; type2: number2.'",
#         "Search for and calculate the occurrences of each entity type included in the options within the input text. Present the results in the format 'type1: number1; type2: number2.'",
#         "Identify and count the number of times each entity type from the provided options appears in the input text. Share the counts using the format 'type1: number1; type2: number2.'",
#         "Inspect the input text for occurrences of each entity type mentioned in the options. Report the counts in the 'type1: number1; type2: number2' format.",
#         "Confirm the occurrences of each entity type listed in the options within the input text. Present the counts as 'type1: number1; type2: number2.'",
#         "Gather the occurrences of each entity type specified in the options within the input text. Format the response as 'type1: number1; type2: number2.'",
#     ],
# }

In [35]:
# def get_augmented_dataset(data):
#     instructions, inputs, outputs = [], [], []
#     for i, ins in enumerate(data['instruction']):
#         if ins not in chatgpt_templates:
#             entity_type = re.search(r'location|organization|person', ins).group()
#             ins = ins.replace(entity_type, '{entity_type}')
#             assert ins in chatgpt_templates
#         for ins_template in chatgpt_templates[ins]:
#             if '{entity_type}' in ins_template:
#                 ins_template = ins_template.replace('{entity_type}', entity_type)
#             instructions.append(ins_template)
#             inputs.append(data['input'][i])
#             outputs.append(data['output'][i])
#     return {
#         'instruction': data['instruction'] + instructions,
#         'input': data['input'] + inputs,
#         'output': data['output'] + outputs
#     }
            

In [36]:
# train_data_aug_1 = get_augmented_dataset(train_data_1)
# train_data_aug_2 = get_augmented_dataset(train_data_2)
# train_data_aug_3 = get_augmented_dataset(train_data_3)

# test_data_aug_1 = get_augmented_dataset(test_data_1)
# test_data_aug_2 = get_augmented_dataset(test_data_2)
# test_data_aug_3 = get_augmented_dataset(test_data_3)



In [33]:
# train_instruct_data = {k: train_data_aug_1[k] + train_data_aug_2[k] + train_data_aug_3[k] for k in train_data_aug_1.keys()} 
# test_instruct_data = {k: test_data_aug_1[k] + test_data_aug_2[k] + test_data_aug_3[k] for k in test_data_aug_1.keys()} 

# ner_instruct_dataset = DatasetDict({
#     'train': Dataset.from_dict(train_instruct_data),
#     'test': Dataset.from_dict(test_instruct_data)
# })
# ner_instruct_dataset.save_to_disk('fingpt-ner-full-instruct')
# ner_instruct_dataset
# # train_data_aug_2['instruction'][-30:]

## CLS Ver. for Zero-shot Training

In [37]:
chatgpt_templates = [
    "What is the entity type of '{entity}' in the input sentence.",
    "With the input text as context, identify the entity type of '{entity}'.",
    "Using the input sentence as a reference, analyze and specify the entity type of '{entity}'.",
    "In the context of the input sentence, examine and categorize the entity type of '{entity}'.",
    "Utilize the input text as context to explore and ascertain the entity type of '{entity}'.",
    "Leverage the input sentence to evaluate and define the entity type for '{entity}'.",
    "Considering the input sentence as context, inspect and classify the entity type of '{entity}'.",
    "With the input sentence as a backdrop, scrutinize and determine the entity type of '{entity}'.",
    "Interpreting the input sentence as context, specify the entity type for '{entity}'.",
    "Assessing the input sentence as context, label the entity type of '{entity}'.",
    "In the input sentence, determine the entity type for '{entity}'.",
    "Within the input text, identify the entity type of '{entity}'.",
    "Analyze the input sentence to find the entity type of '{entity}'.",
    "Check the input sentence for the entity type associated with '{entity}'.",
    "Explore the input sentence to ascertain the entity type of '{entity}'.",
    "Examine the input text to classify the entity type of '{entity}'.",
    "Scrutinize the input sentence to define the entity type of '{entity}'."
]

In [38]:
def entities():
    options = ['location', 'organization', 'person']
    random.shuffle(options)
    return ", ".join(options)


def get_ner_dataset_4(sentences):
    
    inputs, outputs, instructions = [], [], []
    for sentence in sentences:
        is_entity = [tup[1] != 'O' and not tup[1].endswith('MISC') for tup in sentence]
        if sum(is_entity) == 0:
            continue
        tmp_tup_list = []
        entity_dict = {}
        for i, tup in enumerate(sentence):
            if tmp_tup_list and (not is_entity[i] or tmp_tup_list[-1][1] != tup[1] or i + 1 == len(sentence)):
                entity = ' '.join([t[0] for t in tmp_tup_list])
                assert tmp_tup_list[0][1] == tmp_tup_list[-1][1], tmp_tup_list
                entity_type = ent_dict[tmp_tup_list[-1][1].split('-')[-1]]
                tmp_tup_list = [] if not is_entity[i] else [tup]
                if entity not in entity_dict:
                    entity_dict[entity] = entity_type
                elif entity_dict[entity] != entity_type:
                    entity_dict[entity] = ""
                else:
                    pass
            elif is_entity[i]:
                tmp_tup_list.append(tup)
            else:
                pass
        # print(entity_dict)
        for k, v in entity_dict.items():
            # instructions.extend([t.format(entity=entity, options=entities()) for t in chatgpt_templates])
            instructions.extend([t.format(entity=entity) + '\nOptions: ' + entities() for t in chatgpt_templates])
            inputs.extend([' '.join([tup[0] for tup in sentence])] * len(chatgpt_templates))
            outputs.extend([entity_type] * len(chatgpt_templates))
                
    print(len(instructions))
        
    return {"input": inputs, "output": outputs, "instruction": instructions}


In [39]:
train_data = read_conll_file('./SEC-filings/CONLL-format/data/train/FIN5.txt')
test_data = read_conll_file('./SEC-filings/CONLL-format/data/test/FIN3.txt')

train_data_cls = get_ner_dataset_4(train_data)
test_data_cls = get_ner_dataset_4(test_data)

ner_cls_instruct_dataset = DatasetDict({
    'train': Dataset.from_dict(train_data_cls),
    'test': Dataset.from_dict(test_data_cls)
})
ner_cls_instruct_dataset.save_to_disk('fingpt-ner-cls-instruct')
ner_cls_instruct_dataset


13549
3502


Saving the dataset (0/1 shards):   0%|          | 0/13549 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3502 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 13549
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 3502
    })
})

In [40]:
ner_cls_instruct_dataset['train']['instruction'][:10]

["What is the entity type of '40 William St' in the input sentence.\nOptions: person, location, organization",
 "With the input text as context, identify the entity type of '40 William St'.\nOptions: organization, person, location",
 "Using the input sentence as a reference, analyze and specify the entity type of '40 William St'.\nOptions: organization, location, person",
 "In the context of the input sentence, examine and categorize the entity type of '40 William St'.\nOptions: location, organization, person",
 "Utilize the input text as context to explore and ascertain the entity type of '40 William St'.\nOptions: organization, person, location",
 "Leverage the input sentence to evaluate and define the entity type for '40 William St'.\nOptions: person, organization, location",
 "Considering the input sentence as context, inspect and classify the entity type of '40 William St'.\nOptions: organization, location, person",
 "With the input sentence as a backdrop, scrutinize and determine

# FinRED

In [47]:
with open('FinRED/relations.txt') as f:
    relations = [r.strip() for r in f.readlines()]

    
def get_instruction(sent, tuples, with_orig=True, with_cls=False):
    
    instructions, inputs, outputs = [], [], []
    if with_orig:
        instructions.append(f"Given phrases that describe the relationship between two words/phrases as options, extract the word/phrase pair and the corresponding lexical relationship between them from the input text. The output format should be \"relation1: word1, word2; relation2: word3, word4\". Options: {', '.join(relations)}.")
        instructions.append(f"Given the input sentence, please extract the subject and object containing a certain relation in the sentence according to the following relation types, in the format of \"relation1: word1, word2; relation2: word3, word4\". Relations include: {'; '.join(relations)}.")
        inputs.extend([sent] * 2)
        outputs.extend(["; ".join([f"{tup[-1]}: {tup[0]}, {tup[1]}" for tup in tuples])] * 2)
    
    if with_cls:
        for tup in tuples:
            instructions.append(f"Utilize the input text as a context reference, choose the right relationship between {tup[0]} and {tup[1]} from the options. Options: {', '.join(relations)}.")
            instructions.append(f"What is the relationship between {tup[0]} and {tup[1]} in the context of the input sentence. Choose an answer from: {'; '.join(relations)}.")
            inputs.extend([sent] * 2)
            outputs.extend([tup[-1]] * 2)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file, with_orig, with_cls):

    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples, with_orig, with_cls)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [48]:
train_dataset = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup', with_orig=True, with_cls=True)
test_dataset = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup', with_orig=True, with_cls=True)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred')
finred_dataset

Saving the dataset (0/1 shards):   0%|          | 0/27558 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5112 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 27558
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 5112
    })
})

In [49]:
train_dataset = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup', with_orig=True, with_cls=False)
test_dataset = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup', with_orig=True, with_cls=False)

finred_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

finred_dataset.save_to_disk('fingpt-finred-re')
finred_dataset

Saving the dataset (0/1 shards):   0%|          | 0/11400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2136 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 11400
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 2136
    })
})

## CLS Ver. for Zero-shot Training

In [50]:
with open('FinRED/relations.txt') as f:
    all_relations = [r.strip() for r in f.readlines()]


def get_instruction(sent, tuples):
    
    instructions, inputs, outputs = [], [], []
    for tup in tuples:        
        output = tup[-1].replace('_', ' ').replace(' / ', '/').replace(' or ', '/')
        relations = all_relations.copy()
        relations.remove(output)
        random.shuffle(relations)
        relations = relations[:3] + [output]
        random.shuffle(relations)
        instructions.append(f"Utilize the input text as a context reference, choose the right relationship between '{tup[0]}' and '{tup[1]}' from the options.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Refer to the input text as context and select the correct relationship between '{tup[0]}' and '{tup[1]}' from the available options.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Take context from the input text and decide on the accurate relationship between '{tup[0]}' and '{tup[1]}' from the options provided.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"What is the relationship between '{tup[0]}' and '{tup[1]}' in the context of the input sentence.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"In the context of the input sentence, determine the relationship between '{tup[0]}' and '{tup[1]}'.\nOptions: {', '.join(relations)}")
        random.shuffle(relations)
        instructions.append(f"Analyze the relationship between '{tup[0]}' and '{tup[1]}' within the context of the input sentence.\nOptions: {', '.join(relations)}")
        inputs.extend([sent] * 6)
        outputs.extend([output] * 6)
    
    return instructions, inputs, outputs


def get_finred_dataset(sent_file, tup_file):
    
    random.seed(0)
    instructions, inputs, outputs = [], [], []

    with open(sent_file) as f:
        sentences = [s.strip() for s in f.readlines()]
    with open(tup_file) as f:
        tuples_list = [s.split(' | ') for s in f.readlines()]
        
    for sent, tuples in zip(sentences, tuples_list):
        tuples = [[e.strip() for e in tup.split(' ; ')] for tup in tuples]
        
        ins, i, o = get_instruction(sent, tuples)
        
        instructions.extend(ins)
        inputs.extend(i)
        outputs.extend(o)
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })

In [51]:
train_dataset_instruct = get_finred_dataset('FinRED/train.sent', 'FinRED/train.tup')
test_dataset_instruct = get_finred_dataset('FinRED/test.sent', 'FinRED/test.tup')

finred_dataset_instruct = DatasetDict({
    'train': train_dataset_instruct,
    'test': test_dataset_instruct
})

finred_dataset_instruct.save_to_disk('fingpt-finred-cls-instruct')
finred_dataset_instruct

Saving the dataset (0/1 shards):   0%|          | 0/48474 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/8928 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 48474
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 8928
    })
})

In [52]:
finred_dataset_instruct['train']['instruction'][:10]

["Utilize the input text as a context reference, choose the right relationship between 'Apple Inc' and 'Steve Jobs' from the options.\nOptions: industry, founded by, owner of, currency",
 "Refer to the input text as context and select the correct relationship between 'Apple Inc' and 'Steve Jobs' from the available options.\nOptions: industry, currency, owner of, founded by",
 "Take context from the input text and decide on the accurate relationship between 'Apple Inc' and 'Steve Jobs' from the options provided.\nOptions: industry, currency, owner of, founded by",
 "What is the relationship between 'Apple Inc' and 'Steve Jobs' in the context of the input sentence.\nOptions: currency, founded by, owner of, industry",
 "In the context of the input sentence, determine the relationship between 'Apple Inc' and 'Steve Jobs'.\nOptions: industry, founded by, owner of, currency",
 "Analyze the relationship between 'Apple Inc' and 'Steve Jobs' within the context of the input sentence.\nOptions: c

# ConvFinQA

In [41]:
def get_confinqa_dataset(json_file):

    instructions, inputs, outputs = [], [], []

    instruction = 'Read the following texts and table with financial data from an S&P 500 earnings report carefully.' \
    'Based on the question-answer history (if provided), answer the last question. ' \
    'The answer may require mathematical calculation based on the data provided.\n'

    samples = json.load(open(json_file))
    for sample in samples:
        annos = sample['annotation']
        pre_text, post_text = annos['amt_pre_text'], annos['amt_post_text']
        table = annos['amt_table'].replace('<td></td>', '<td>-</td>')
        context = f'{pre_text} {table} {post_text}\n'
        questions, answers, turn_ind = annos['dialogue_break'], annos['exe_ans_list'], annos['turn_ind']
        for i in range(turn_ind):
            context += f'Question: {questions[i]}\n'
            context += f'Answer: {answers[i]}\n'
        context += f'Question: {questions[turn_ind]}\n'
        outputs.append(str(answers[turn_ind]))
        instructions.append(instruction)
        inputs.append(context)
        
    return Dataset.from_dict({
        'input': inputs,
        'output': outputs,
        'instruction': instructions
    })
    

In [42]:
train_file = 'ConvFinQA/data/train_turn.json'
test_file = 'ConvFinQA/data/dev_turn.json'

train_dataset = get_confinqa_dataset(train_file)
test_dataset = get_confinqa_dataset(test_file)

convfinqa_dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

convfinqa_dataset.save_to_disk('fingpt-convfinqa')
convfinqa_dataset

Saving the dataset (0/1 shards):   0%|          | 0/11104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 11104
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 1490
    })
})

In [43]:
convfinqa_dataset['train'][9]

{'input': 'in a new business model such as the retail segment is inherently risky , particularly in light of the significant investment involved , the current economic climate , and the fixed nature of a substantial portion of the retail segment\'s operating expenses . results for this segment are dependent upon a number of risks and uncertainties , some of which are discussed below under the heading "factors that may affect future results and financial condition." backlog in the company\'s experience , the actual amount of product backlog at any particular time is not a meaningful indication of its future business prospects . in particular , backlog often increases in anticipation of or immediately following new product introductions because of over- ordering by dealers anticipating shortages . backlog often is reduced once dealers and customers believe they can obtain sufficient supply . because of the foregoing , backlog cannot be considered a reliable indicator of the company\'s ab

# FinEval

In [44]:
csv_list = glob('FinEval/val/*.csv') + glob('FinEval/dev/*.csv')
subject_mapping = json.load(open('FinEval/subject_mapping.json'))

instructions, inputs, outputs = [], [], []

for csv_file in csv_list:
    subject = subject_mapping[csv_file.split('/')[-1][:-8]][1]
    df = pd.read_csv(csv_file)
    for index, row in df.iterrows():
        instructions.append(f'以下是中国关于{subject}考试的单项选择题，请选出其中的正确答案。')
        inputs.append(f"{row['question']}\nA. {row['A']}\nB. {row['B']}\nC. {row['C']}\nD. {row['D']}\n")
        outputs.append(f"{row['answer']}. {row[row['answer']]}")
        
fineval_dataset = Dataset.from_dict({
    'input': inputs,
    'output': outputs,
    'instruction': instructions
})
fineval_dataset = fineval_dataset.train_test_split(0.2, seed=42)
fineval_dataset.save_to_disk('fingpt-fineval')
fineval_dataset
    

Saving the dataset (0/1 shards):   0%|          | 0/1056 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/265 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 1056
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 265
    })
})

In [45]:
fineval_dataset['train'][3]

{'input': '研究社会资本再生产的出发点是____。\nA. 货币资本\nB. 生产资本\nC. 流通资本\nD. 社会总产品\n',
 'output': 'D. 社会总产品',
 'instruction': '以下是中国关于政治经济学考试的单项选择题，请选出其中的正确答案。'}

# FiQA QA

In [53]:
docs = pd.read_csv('FiQA_train/FiQA_train_doc_final.tsv', sep='\t')
questions = pd.read_csv('FiQA_train/FiQA_train_question_final.tsv', sep='\t')
qa_pairs = pd.read_csv('FiQA_train/FiQA_train_question_doc_final.tsv', sep='\t')

In [54]:
doc_dict, question_dict = {}, {}
for i, row in docs.iterrows():
    doc_dict[row['docid']] = row['doc']
for i, row in questions.iterrows():
    question_dict[row['qid']] = row['question']
    
instruction_templates = [
    "Utilize your financial knowledge, give your answer or opinion to the input question or subject . Answer format is not limited.",
    "Offer your insights or judgment on the input financial query or topic using your financial expertise. Reply as normal question answering",
    "Based on your financial expertise, provide your response or viewpoint on the given financial question or topic. The response format is open.",
    "Share your insights or perspective on the financial matter presented in the input.",
    "Offer your thoughts or opinion on the input financial query or topic using your financial background."
]

inputs, outputs, instructions = [], [], []
for i, row in qa_pairs.iterrows():
    qid, docid = row['qid'], row['docid']
    q = str(question_dict[qid])
    doc = str(doc_dict[docid])
    inputs.append(q)
    outputs.append(doc)
    instructions.append(instruction_templates[i%5])

fiqa_qa_dataset = Dataset.from_dict({
    'input': inputs,
    'output': outputs,
    'instruction': instructions
})
fiqa_qa_dataset.save_to_disk('fingpt-fiqa_qa')
fiqa_qa_dataset

Saving the dataset (0/1 shards):   0%|          | 0/17110 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'instruction'],
    num_rows: 17110
})