In [1]:
import json
import glob
import os

def get_paths(input_folder):
    """
    Get a list of all .json file paths in the input folder.
    :param input_folder: Folder containing .json files.
    :return: List of file paths.
    """
    list_files = glob.glob(input_folder + '/*.json')
    return list_files

def load_text(json_path):
    """
    Load and parse JSON content from a given file.
    :param json_path: Path to the JSON file.
    :return: Parsed JSON content.
    """
    with open(json_path, 'r', encoding='utf-8') as json_file:
        data = json_file.read()
        content = json.loads(data)
    
    return content

def process_and_write(loaded_dicts, output_folder, json_path):
    """
    Process the loaded JSON data and write it to a .conll formatted file.
    :param loaded_dicts: Parsed content of the JSON file.
    :param output_folder: Folder to save the .conll files.
    :param json_path: Path of the original JSON file.
    """
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Create the output file name with .conll extension
    base_name = os.path.basename(json_path)[:-5]  # Remove .json extension
    conll_filename = base_name + '.txt'
    output_path = os.path.join(output_folder, conll_filename)
    
    # Write to the output file in CoNLL format
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in loaded_dicts:
            for annotation in item.get('annotations', []):
                for result in annotation.get('result', []):
                    value = result.get('value', {})
                    text = value.get('text', '')
                    label = ','.join(value.get('hypertextlabels', []))
                    if text and label:
                        f.write(f"{text} {label}\n")
                f.write("\n")  # Separate different annotations with a new line

def convert_json_to_conll(input_folder, output_folder):
    """
    Main function to convert all JSON files in the input folder to CoNLL format files.
    :param input_folder: Folder containing .json files.
    :param output_folder: Folder where .conll files will be saved.
    """
    json_paths = get_paths(input_folder)
    
    for json_path in json_paths:
        loaded_dicts = load_text(json_path)
        process_and_write(loaded_dicts, output_folder, json_path)

# Specify the input folder containing JSON files and the output folder for CoNLL files
input_folder = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'
output_folder = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties_info_data'

# Convert all JSON files in the input folder to CoNLL format
convert_json_to_conll(input_folder, output_folder)


In [2]:
def process(loaded_dicts, output_folder, json_path):

    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Create the output file name with .conll extension
    base_name = os.path.basename(json_path)[:-5]  # Remove .json extension
    conll_filename = base_name + '.txt'
    output_path = os.path.join(output_folder, conll_filename)
    
    # Write to the output file in CoNLL format
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in loaded_dicts:
            for annotation in item.get('annotations', []):
                for result in annotation.get('result', []):
                    value = result.get('value', {})
                    text = value.get('text', '')
                    label = ','.join(value.get('hypertextlabels', []))
                    if text and label:
                        f.write(f"{text} \n")
                f.write("\n")  # Separate different annotations with a new line

def convert_json_to_text(input_folder, output_folder):
    json_paths = get_paths(input_folder)
    
    for json_path in json_paths:
        loaded_dicts = load_text(json_path)
        process(loaded_dicts, output_folder, json_path)

# Specify the input folder containing JSON files and the output folder for CoNLL files
input_folder = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties information'
output_folder = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties_info_text'

# Convert all JSON files in the input folder to CoNLL format
convert_json_to_text(input_folder, output_folder)


In [7]:
import os
import re
import json
from sklearn.metrics import f1_score
from together import Together

# Define the one-shot example for each entity label
one_shot_prompt = """
You are an AI tasked with identifying specific entities in a text. Below is an example for each entity type with labeled entities, followed by a new text where you will need to identify entities based on the example.

Example:
Input: "BRIGHT HORIZONS CAPITAL CORP"
Output: Organization Name

Input: "Swing Line Lender"
Output: Organization Role

Input: "legal counsel"
Output: Organization Sub-Role

Input: "Don R. Madison"
Output: Person Name

Input: "Senior Vice President"
Output: Person Position

Input: "40 Wantage Avenue, Branchville, New Jersey 07890"
Output: Location

Input: "corporate headquarters"
Output: Location Type

New Text:
{input_text}

Please extract the entities from the 'New Text' and label them in the following format:
- "Entity" "Label"

Labels:
· Organization Name
· Organization Role
· Organization Sub-Role
· Location
· Location Type
· Person Name
· Person Position
"""

# Read the text file where each line is a text to be labeled
input_file_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties_info_text\test.txt'
output_file_path = r'C:\Users\archishman vb\OneDrive\Desktop\predicted_labels.txt'
ground_truth_file_path = r'C:\Users\archishman vb\OneDrive\Desktop\annotated\parties_info_data\ground_truth.txt'

# Initialize the Together API client
client = Together(api_key='961df14a57ad71c7ec591c73955e59825b7b8c57b7d6dea750bceee02fed625b')

# Open the input file and read the content
with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
    for line in input_file:
        text_content = line.strip()
        
        # Construct the full prompt by inserting the current line into the one-shot example
        final_prompt = one_shot_prompt.format(input_text=text_content)
        
        # Get predictions from the LLM (without streaming)
        response = client.chat.completions.create(
            model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
            messages=[{"role": "user", "content": final_prompt}],
            max_tokens=2048,
            temperature=0.7,
            top_p=0.7,
            top_k=50,
            repetition_penalty=1,
            stop=["<|eot_id|>", "<|eom_id|>"]
        )

        # Extract the LLM's output
        predicted_output = response.choices[0].message.content

        # Define a function to parse the predicted entities
        def parse_entities(predicted_output):
            entities = []
            lines = predicted_output.strip().split("\n")
            for line in lines:
                match = re.match(r'(.+?) "(.+)"', line)
                if match:
                    entity, label = match.groups()
                    entities.append((entity.strip(), label.strip()))
            return entities

        # Parse the predicted entities
        predicted_entities = parse_entities(predicted_output)
        
        # Write the predicted entity-label pairs to the output file
        for entity, label in predicted_entities:
            output_file.write(f"{entity} {label}\n")

# Calculate F1 score using the ground truth file
def calculate_f1_score(predictions_file, ground_truth_file):
    with open(predictions_file, 'r') as pred_file, open(ground_truth_file, 'r') as gt_file:
        predicted_labels = []
        true_labels = []
        
        # Read predicted labels
        for line in pred_file:
            _, label = line.strip().rsplit(' ', 1)
            predicted_labels.append(label)
        
        # Read true labels
        for line in gt_file:
            _, label = line.strip().rsplit(' ', 1)
            true_labels.append(label)
        
        # Calculate F1 score
        return f1_score(true_labels, predicted_labels, average='weighted')

# Path to the predicted and ground truth files
f1 = calculate_f1_score(output_file_path, ground_truth_file_path)
print(f"F1 Score: {f1}")


UnicodeEncodeError: 'charmap' codec can't encode character '\u5168' in position 32: character maps to <undefined>