In [2]:
import pandas as pd

# Load the source CSV into `df`; later cells read its `tokens` and `output` columns.
SCORED_CSV_PATH = "../data/scored/politics_tasklevel_uniform1000.csv"
df = pd.read_csv(SCORED_CSV_PATH)

In [3]:
def get_prompt(paragraph: str, tags: str) -> dict:
    """Build one fine-tuning record for the entity-tagging task.

    Args:
        paragraph: The text to be tagged; stored under the ``context`` key.
        tags: The already-formatted target annotation for ``paragraph``;
            stored unchanged under the ``response`` key.

    Returns:
        A dict with the keys ``instruction`` (the fixed task description),
        ``context`` and ``response``.
    """
    # NOTE(review): the instruction asks the model for a "YAML list", while
    # parse_text_to_json in this notebook serializes tags with json.dumps —
    # confirm the intended response format matches the wording below.
    return {
        "instruction": "An entity is a person (person), organization (organization), politician (politician), political party (politicalparty), event (event), election (election), country (country), location (location), or other political entity (misc). Dates, times, abstract concepts, adjectives, and verbs are not entities.\n\nFor each potential entity in the text, determine if it is an entity and, if so, its type. Provide the reason for your decision. Format your response as a YAML list, with each item containing the following fields:\n\nspan: The text span of the potential entity.\nentity_type: The type of the entity (person, organization, politician, politicalparty, event, election, country, location, misc) or false if not an entity.\nreason: A brief explanation of why the span is or is not an entity.",
        "context": paragraph,
        "response": tags
    }

In [4]:
import re
import yaml
from collections import OrderedDict
from collections import OrderedDict
import yaml
import json

def represent_ordereddict(dumper, data):
    """Represent an ``OrderedDict`` as a plain YAML mapping node.

    Each key and value is converted with ``dumper.represent_data`` (key
    first, then value) and the pairs are emitted in the dict's own
    iteration order.
    """
    pairs = [
        (dumper.represent_data(key), dumper.represent_data(val))
        for key, val in data.items()
    ]
    return yaml.nodes.MappingNode(u'tag:yaml.org,2002:map', pairs)


# Register the representer so yaml dumps of OrderedDict go through the function above.
yaml.add_representer(OrderedDict, represent_ordereddict)

def parse_text_to_json(text):
    """Convert pipe-delimited annotation lines into a JSON list of tag records.

    ``text`` is split on newlines. A line that splits into exactly three
    ``|``-separated fields is read as ``<prefix>. <span> | <flag> | <reason>``;
    every other line is ignored. For each accepted line:

    * ``span`` is everything after the first ``.`` of the first field, stripped.
    * ``entity_type`` is ``'false'`` unless the flag is ``true``
      (case-insensitive). For true flags it is the first single-word
      parenthesized token found in the reason, e.g. ``(politician)``, or
      ``'misc'`` when the reason contains none.
    * ``reason`` is the stripped third field.

    Args:
        text: Multi-line annotation string.

    Returns:
        A JSON string encoding a list of objects with the keys ``span``,
        ``entity_type`` and ``reason`` (in that order). Returns the literal
        string ``"ERROR"`` when parsing fails, e.g. ``text`` is not a string
        or a three-field line has no ``.`` in its first field.
    """
    try:
        records = []
        for line in text.split('\n'):
            parts = line.split('|')
            if len(parts) != 3:
                continue

            # maxsplit=1 strips only the leading "<prefix>." and keeps any
            # dots that belong to the span itself (e.g. "U.S. Senate").
            span = parts[0].split('.', 1)[1].strip()
            is_entity = parts[1].strip().lower() == 'true'
            reason = parts[2].strip()

            if is_entity:
                type_match = re.search(r'\((\w+)\)', reason)
                entity_type = type_match.group(1) if type_match else 'misc'
            else:
                entity_type = 'false'

            records.append(OrderedDict([
                ('span', span),
                ('entity_type', entity_type),
                ('reason', reason)
            ]))
        return json.dumps(records)
    except Exception:
        # Callers rely on the "ERROR" sentinel to discard unparseable rows.
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return "ERROR"

In [5]:
# Parse each raw `output` once and reuse the result for both new columns.
parsed_tags = df["output"].apply(parse_text_to_json)

# `prompt` holds the training record dict; `foutput` holds the parsed tag string
# on its own. `prompt` is assigned first so the column order is unchanged.
df["prompt"] = [
    get_prompt(tokens, tags)
    for tokens, tags in zip(df["tokens"], parsed_tags)
]
df["foutput"] = parsed_tags

In [6]:
# Drop rows without usable tags. parse_text_to_json yields "[]" (2 chars) when
# no tag line was found and "ERROR" (5 chars) on failure, so any string longer
# than 5 characters carries at least one tag record.
MIN_FOUTPUT_LEN = 5
# .copy() makes the result an independent frame, so adding columns in later
# cells does not trigger SettingWithCopyWarning.
df = df[df["foutput"].apply(len) > MIN_FOUTPUT_LEN].copy()

In [8]:
# Build the inference-time prompt from each record's instruction and context.
# Only the `prompt` column is needed, so map over that Series instead of
# materializing every row with df.apply(axis=1).
df["inference_prompt"] = df["prompt"].apply(
    lambda p: f"### INSTRUCTION: {p['instruction']} ### PARAGRAPH: {p['context']}  ### TAG_SPANS: "
)

In [9]:
# Write the `prompt` records out as JSON Lines, one record per line.
DRAFT_JSONL_PATH = '../data/1k_draft.jsonl'
df['prompt'].to_json(DRAFT_JSONL_PATH, orient='records', lines=True)

In [None]:
# Shell escape: create a fine-tuning job with the `firectl` CLI, reading its
# settings from ./config.yaml and labelling the job "My Job 3".
# NOTE(review): presumably config.yaml references the JSONL written by the
# previous cell — confirm before running.
!firectl create fine-tuning-job --settings-file ./config.yaml --display-name "My Job 3"