In [1]:
import psycopg2
import json
import pandas as pd

In [2]:
conn = psycopg2.connect(
    dbname="synthetic_data",
    user="airflow",
    password="airflow",
    host="localhost",
    port="5432"
)



cur = conn.cursor()


cur.execute(
"""
select sas.article_summary, ap.part_text, err.raw_relation_text, a.dataset, a.abstract, a.section_names 
from extracted_part_topics ept 
join extracted_relations_raw err on err.part_id = ept.part_id 
join article_parts ap on ap.part_id = ept.part_id 
join short_article_summary sas on sas.article_id = ept.article_summary_id 
join articles a on a.article_id = err.article_id
order by ept.part_id;
"""
)
rows = cur.fetchall()

cur.close()
conn.close()

In [3]:
pd.DataFrame(rows, columns=["general_text_summary", "text_part", "extracted_relations", "dataset", "abstract", "section_names"]).to_csv("generated_data/extracted_relations.csv", index=False)

In [4]:
pd.DataFrame([r[:3] for r in rows], columns=["general_text_summary", "text_part", "extracted_relations"]).to_csv("generated_data/core_extracted_relations.csv", index=False)

In [14]:
print(json.dumps(json.loads(pd.read_csv("generated_data/extracted_relations.csv").head(100)['extracted_relations'][20]), indent=4))

{
    "list_of_entities": [
        "reversals",
        "mateos",
        "figures",
        "rules",
        "current_reversal",
        "ensemble",
        "bifurcation",
        "jumps",
        "thumb",
        "spikes",
        "current",
        "particles",
        "open_question",
        "behavior",
        "heuristics",
        "direction",
        "chaotic",
        "parameter"
    ],
    "relations": [
        {
            "description": "bifurcations in single - trajectory behavior often corresponds to sudden spikes or jumps in the current for an ensemble in the same system",
            "source_entities": [
                "bifurcation"
            ],
            "target_entities": [
                "current"
            ]
        },
        {
            "description": "current reversals are a special case of this",
            "source_entities": [
                "current"
            ],
            "target_entities": [
                "bifurcation"
            ]
    

In [15]:
pd.read_csv("generated_data/core_extracted_relations.csv").head(10)

Unnamed: 0,general_text_summary,text_part,extracted_relations
0,The impact of a random phase diffuser on las...,@xmath62 denotes the rkhs generated by the st...,"{""list_of_entities"": [""learning rates"", ""nonpa..."
1,The impact of a random phase diffuser on las...,is an orthonormal basis of @xmath94 and @xmat...,"{""list_of_entities"": [""multivariate Gaussian k..."
2,The impact of a random phase diffuser on las...,", to be proved in section [ samplesection ] ....","{""list_of_entities"": [""additive models"", ""repr..."
3,The impact of a random phase diffuser on las...,@xmath170 has a unique @xmath159-quantile @xm...,"{""list_of_entities"": [""semiparametric statisti..."
4,The impact of a random phase diffuser on las...,"xmath230 for some @xmath231 , then for any @xm...","{""list_of_entities"": [""learning rates"", ""addit..."
5,The impact of a random phase diffuser on las...,if the function @xmath121 is given by ( [ gau...,"{""list_of_entities"": [""multivariate Gaussian k..."
6,The impact of a random phase diffuser on las...,", and @xmath278 is some user - defined positi...","{""list_of_entities"": [""multivariate Gaussian k..."
7,The article discusses the current reversal p...,the leptonic decays of a charged pseudoscalar ...,"{""list_of_entities"": [""statistical uncertainti..."
8,The article discusses the current reversal p...,calculation @xcite .\nthis disagreement is p...,"{""list_of_entities"": [""collisions"", ""discrepan..."
9,The article discusses the current reversal p...,xmath59 is required as a signature of @xmath60...,"{""list_of_entities"": [""statistical uncertainti..."


In [16]:

expected_json = """
{
  "$schema": "extraction_schema.json",
  "type": "object",
  "properties": {
    "section_description": {
      "type": "string"
    }
    "list_of_entities": {
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "relations": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "description": {
            "type": "string"
          },
          "source_entities": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "target_entities": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "strength": {
            "type": "string",
            "enum": ["strong", "moderate", "weak"]
          }
        },
        "required": ["description", "source_entities", "target_entities"]
      }
    },
    
  },
  "required": ["list_of_entities", "relations", "section_description"]
}
"""


def generate_prompt_with_summary_and_schema(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an summary and excerpt from an article. Your task is to extract information about entities and relations to the JSON format as follows:
```json-schema
{expected_json.strip()}
```
### General Text Summary:
{data_point["general_text_summary"]}
### Text Part to Extract From:
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""


def generate_prompt_with_summary(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an summary and excerpt from an article. Your task is to extract information about entities and relations to the JSON format.
### General Text Summary:
{data_point["general_text_summary"]}
### Text Part to Extract From:
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""
 

def generate_prompt_with_merged_text(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an part of larger text. Your task is to extract information about entities and relations to the JSON format.
### Text Part to Extract From:
{data_point["general_text_summary"]}
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""


def generate_prompt_with_merged_text_with_schema(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an part of larger text. Your task is to extract information about entities and relations to the JSON format as follows:
```json-schema
{expected_json.strip()}
```
### Text Part to Extract From:
{data_point["general_text_summary"]}
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""


def generate_prompt_no_summary_with_schema(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an part of larger text. Your task is to extract information about entities and relations to the JSON format as follows:
```json-schema
{expected_json.strip()}
```
### Text Part to Extract From:
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""


def generate_prompt_no_summary(data_point):

    extracted_relations = json.loads(data_point["extracted_relations"])
    reordered_json = {}

    for k in ["section_description", "list_of_entities", "relations"]:
        reordered_json[k] = extracted_relations[k]

    
    ordered_extracted_relations = json.dumps(reordered_json, indent=2)

    # ordered_extracted_relations = sorted(ordered_extracted_relations, key=lambda x: order[x["description"]])

    return f"""Below is an part of larger text. Your task is to extract information about entities and relations to the JSON format.
### Text Part to Extract From:
{data_point["text_part"]}
### Extracted Relations:
{ordered_extracted_relations}"""


In [17]:
def generate_all_prompt_variants(data_point):
    return [
        generate_prompt_with_summary_and_schema(data_point),
        generate_prompt_with_summary(data_point),
        generate_prompt_with_merged_text(data_point),
        generate_prompt_with_merged_text_with_schema(data_point),
        generate_prompt_no_summary_with_schema(data_point),
        generate_prompt_no_summary(data_point)
    ], ["prompt_with_summary_and_schema", "prompt_with_summary", "prompt_with_merged_text", "prompt_with_merged_text_with_schema", "prompt_no_summary_with_schema", "prompt_no_summary"]

In [20]:
parsed_data = pd.read_csv("generated_data/extracted_relations.csv")

all_prompts = []
all_prompt_names = []

for i, row in parsed_data.iterrows():
    prompts, prompt_names = generate_all_prompt_variants(row)
    all_prompts.append(prompts)
    all_prompt_names = prompt_names

pd.DataFrame(all_prompts, columns=all_prompt_names).to_csv("generated_data/llama2_prompts.csv", index=False)

In [21]:
pd.read_csv("generated_data/llama2_prompts.csv").head(20)[0]

KeyError: 0

In [30]:
print(generate_prompt_with_summary_and_schema(pd.read_csv("generated_data/core_extracted_relations.csv").head(21).iloc[20]))

Below is an summary and excerpt from an article. Your task is to extract information about entities and relations to the JSON format as follows:
```json-schema
{
  "$schema": "extraction_schema.json",
  "type": "object",
  "properties": {
    "section_description": {
      "type": "string"
    }
    "list_of_entities": {
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "relations": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "description": {
            "type": "string"
          },
          "source_entities": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "target_entities": {
            "type": "array",
            "items": {
              "type": "string"
            }
          },
          "strength": {
            "type": "string",
            "enum": ["strong", "moderate", "weak"]
          }
        },
        "