# Preprocess the results of LLM prompting methods for all annotators

### Install the sentence-transformers library and import the dependencies

In [None]:
!pip install sentence-transformers

In [None]:
import os
import re
import json
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict, Counter

### File paths to preprocess

In [38]:
# Timestamps of the prompting runs whose raw outputs are preprocessed
_run_timestamps = [
    '2024-06-03_20-20-45',
    '2024-06-03_20-34-02',
    '2024-06-03_20-49-27',
    '2024-06-03_21-04-30',
    '2024-06-03_21-21-15',
    '2024-06-03_21-36-55',
    '2024-06-03_21-53-17',
    '2024-06-03_22-06-36',
    '2024-06-03_22-21-36',
    '2024-06-03_22-36-29',
]

# Raw result files to preprocess, one per run
file_paths = [
    f'results/RaR_Emotion_Prompt_CoT_Few_shot_all_annotators_{timestamp}.txt'
    for timestamp in _run_timestamps
]

# Name of the prompting method (used in the names of the generated files)
method = "RaR-Few-shot-CoT-Emotion-Prompt"

# How many annotators are present in every result file
number_of_annotators = 5

In [39]:
def save_results_per_annotator(file_path, output_dir):
    """
    Split a raw results file into one text file per annotator.

    Every non-blank line of `file_path` is parsed as a JSON object with a "text" and a
    "content" field. Each "Annotator <n> - {...}" section found in "content" is written
    as a "Text: ..." / "{...}" record to
    `<output_dir>/<input file name without extension>_Annotator_<n>.txt`.

    The output directory is created when it is missing. A per-annotator file is
    truncated the first time it is written during a call and appended to afterwards,
    so running the function again does not duplicate records.

    Args:
        file_path (str): path of the raw results file (one JSON object per line)
        output_dir (str): the output directory where to save the individual results
    """
    # Both values are the same for every line, so they are computed once
    annotator_pattern = re.compile(r"Annotator (\d+) - \{(.*?)\}", re.DOTALL)
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    os.makedirs(output_dir, exist_ok=True)

    # Output files already (re)created during this call; later records are appended
    started_paths = set()

    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines carry no entry

            try:
                entry = json.loads(line)  # Try to parse each line as a JSON object
                text = entry["text"]
                annotators = annotator_pattern.findall(entry["content"])
            except json.JSONDecodeError as e:
                print(f"Failed to decode JSON from line in file {file_path}: {str(e)}")
                continue
            except (KeyError, TypeError) as e:
                # Valid JSON, but not an object with usable "text" / "content" fields
                print(f"Skipping line without usable 'text'/'content' fields in file {file_path}: {e!r}")
                continue

            for annotator_id, topics_raw in annotators:
                topics_formatted = "{" + topics_raw + "}"
                output_path = os.path.join(output_dir, f'{base_name}_Annotator_{annotator_id}.txt')
                result = "Text: " + text + '\n' + topics_formatted + '\n\n'

                mode = 'a' if output_path in started_paths else 'w'
                started_paths.add(output_path)

                print(result)
                with open(output_path, mode, encoding='utf-8') as output_file:
                    output_file.write(result)

Text: The market is not a reliable partner. The yield is important and nature and the environment are secondary, if damage occurs there then so be it. Until now, the working method was as just described. Perhaps the municipality can play a coercive role in this and only cooperate with market parties if the damage is 0 or repaired as much as possible at the expense of the market party.
{'Topics': Municipality and residents engagement in the energy sector, Market Determination Dynamics}


Text: The market is not a reliable partner. The yield is important and nature and the environment are secondary, if damage occurs there then so be it. Until now, the working method was as just described. Perhaps the municipality can play a coercive role in this and only cooperate with market parties if the damage is 0 or repaired as much as possible at the expense of the market party.
{'Topics': Energy storage and supplying energy in The Netherlands, Wind and solar energy}


Text: The market is not a re

In [None]:
output_dir = "individual_annotators_results"

# Make sure the output folder exists before any per-annotator file is written
os.makedirs(output_dir, exist_ok=True)

for file_path in file_paths:
    save_results_per_annotator(file_path, output_dir)

print(f"Processing completed. Processed files are saved in the '{output_dir}' folder.")

In [42]:

# Topic number -> accepted (lower-case) spellings of that topic
_topic_aliases = {
    1: [
        "municipality and residents engagement in the energy sector",
        "residents engagement",
        "municipality engagement",
        "residents engagement in the energy sector",
        "municipality engagement in the energy sector",
    ],
    2: [
        "energy storage and supplying energy in the netherlands",
        "energy storage and supply in the netherlands",
        "energy storage in the netherlands",
        "supplying energy in the netherlands",
        "energy storage",
    ],
    3: [
        "wind and solar energy",
        "solar energy",
        "wind energy",
    ],
    4: [
        "market determination dynamics",
        "market dynamics",
        "market determination",
    ],
    5: [
        "landscapes and windmills tourism",
        "landscapes and windmills",
    ],
    6: [
        "hydrogen energy pipeline networks",
        "hydrogen energy pipeline",
    ],
}

# Mapping dictionary for topics: spelling -> topic number (same key order as listed above)
topic_mapping = {
    alias: topic_number
    for topic_number, aliases in _topic_aliases.items()
    for alias in aliases
}

# Sentence Transformer model used to embed topic names
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed every known spelling once so the tensors can be reused for each comparison
topic_embeddings = {
    known_topic: model.encode(known_topic, convert_to_tensor=True)
    for known_topic in topic_mapping
}

def is_valid_topic(topic):
    """
    Tell whether a topic string still has content once special characters are ignored

    A topic is valid when it contains at least one ASCII letter or digit.

    Args:
        topic (str): the topic string
    Returns:
        bool : True if topic is valid, False otherwise

    """
    # One ASCII alphanumeric character is enough for the string to be non-empty
    # after every other character has been discarded
    return re.search(r'[A-Za-z0-9]', topic) is not None

def _match_topic_by_embedding(topic, threshold):
    """
    Map a free-text topic name to its topic number

    The lower-cased, stripped name is looked up in 'topic_mapping' first. If it is not a
    known spelling, it is embedded with the Sentence-Transformers model and compared with
    the pre-computed embeddings of every known spelling using cosine similarity.

    Args:
        topic (str): the topic name to match
        threshold (float): minimum cosine similarity for a match to be accepted
    Returns:
        int or None : the topic number, or None when the name is invalid or no known
        spelling is similar enough (a message is printed in both cases)

    """
    topic_lower = topic.lower().strip()  # Convert to lowercase
    if not is_valid_topic(topic_lower):
        print(f"Invalid topic '{topic}'")
        return None

    # Known spellings need no model call
    if topic_lower in topic_mapping:
        return topic_mapping[topic_lower]

    topic_embedding = model.encode(topic_lower, convert_to_tensor=True)
    # Perform cosine similarity matching
    similarities = {
        key: util.pytorch_cos_sim(topic_embedding, emb).item()
        for key, emb in topic_embeddings.items()
    }
    matched_topic, score = max(similarities.items(), key=lambda x: x[1])
    if score >= threshold:
        return topic_mapping[matched_topic]

    print(f"Low match score for topic '{topic}': {score}")
    return None


def process_topics(topics):
    """
    Map the topic list to the original list of topics
    The mapping is done using Sentence-Transformers embedding model (if the topic is not in 'topic_mapping')

    Integers and strings made only of decimal digits (surrounding whitespace ignored) are
    taken as topic numbers as they are. Any other string is matched by name. List items
    that are neither int nor str are reported as invalid and skipped. A single int or str
    is handled like a one-element list; any other input type yields an empty result.

    Args:
        topics (list): the list of topics to be preprocessed
    Returns:
        topic_numbers (list) : sorted list of indices indicating the mapping of the topics to the original list

    """
    threshold = 0.70  # Cosine similarity threshold for similarity

    if isinstance(topics, (int, str)):
        candidates = [topics]
    elif isinstance(topics, list):
        candidates = topics
    else:
        candidates = []

    topic_numbers = set()
    for topic in candidates:
        if isinstance(topic, int):
            topic_numbers.add(topic)
        elif isinstance(topic, str):
            digits = topic.strip()
            # isdecimal() only accepts characters that int() can convert
            if digits.isdecimal():
                topic_numbers.add(int(digits))
            else:
                topic_number = _match_topic_by_embedding(topic, threshold)
                if topic_number is not None:
                    topic_numbers.add(topic_number)
        else:
            # e.g. a dict or float produced by JSON parsing; it cannot be mapped to a topic
            print(f"Invalid topic '{topic}'")

    return sorted(topic_numbers)

def _parse_topics_string(topics_str):
    """
    Turn the raw text found after 'Topics:' into the input passed to process_topics

    The text is first read as JSON (single quotes swapped for double quotes). When that
    fails, quoted names and digit groups are collected instead. When nothing is found at
    all, the text is split on ', '.

    Args:
        topics_str (str): the raw topics text
    Returns:
        the parsed topics (usually a list of names and/or numbers)

    """
    try:
        topics = json.loads(topics_str.replace("'", "\""))
        if isinstance(topics, (str, dict)):
            topics = [topics]
    except json.JSONDecodeError:
        topics = re.findall(r"'(.*?)'", topics_str) + re.findall(r"(\d+)", topics_str)

    if not topics:
        topics = topics_str.split(', ')
    return topics


def _topics_line_from_string(topics_str):
    """
    Map a raw topics string to the "{'Topics': ...}" line written to the processed file
    """
    print(f"Found topics string: {topics_str}")
    topics = _parse_topics_string(topics_str)
    print(f"Extracted topics: {topics}")
    processed_topics = process_topics(topics)
    print(f"Processed topics: {processed_topics}")
    return f"{{'Topics': {', '.join(map(str, processed_topics))}}}\n"


def preprocess_file(file_path, processed_data_dir):
    """
    Preprocess file with results to map the topics to the original topics list
    Writes the preprocessed file in a new file named after 'file_path' with the
    suffix '_processed' in the folder 'processed_data_dir'

    For every "Text:" line, the first following line (before the next "Text:" line) that
    contains "{'Topics': ...}" or "['Topics': ...]" supplies the topics of that text.
    A line starting with "Topics" outside such a record is written without a text line.
    A text without any topics line is reported and left out of the output.

    Args:
        file_path (str): the file path to preprocess
        processed_data_dir (str): the output to save the file

    """
    brace_pattern = re.compile(r"\{'Topics':(.+?)\}")
    bracket_pattern = re.compile(r"\['Topics':(.+?)\]")
    processed_lines = []

    print(f"Processing file: {file_path}")

    with open(file_path, 'r', encoding='latin1') as file:
        lines = file.readlines()

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.lower().startswith("text:"):
            text_line = line
            print(f"Found text: {text_line}")
            i += 1

            found_topics = False
            while i < len(lines) and not lines[i].strip().lower().startswith("text:"):
                candidate = lines[i].strip()
                match = brace_pattern.search(candidate) or bracket_pattern.search(candidate)
                if match is None:
                    i += 1
                    continue

                topics_line = _topics_line_from_string(match.group(1).strip())
                processed_lines.append(f"{text_line}\n")
                processed_lines.append(topics_line)
                processed_lines.append("\n")
                found_topics = True
                break

            if not found_topics:
                # i already points at the next "Text:" line (or past the end of the file);
                # advancing it again would silently skip that record
                print(f"No topics found for text: {text_line}")
                continue
        elif line.startswith(("['Topics", "Topics", "'Topics")):
            _, separator, remainder = line.partition(":")
            if separator:
                topics_line = _topics_line_from_string(remainder.strip().strip('[]'))
                processed_lines.append(topics_line)
                processed_lines.append("\n")
            else:
                # A "Topics" line without ':' carries no topics to parse
                print(f"Skipping topics line without ':': {line}")
        i += 1

    # Save the processed file to the processed_data directory
    if processed_data_dir:
        os.makedirs(processed_data_dir, exist_ok=True)
    output_path = os.path.join(processed_data_dir, os.path.basename(file_path).replace(".txt", "_processed.txt"))
    with open(output_path, 'w', encoding='latin1') as output_file:
        output_file.writelines(processed_lines)
    print(f"Finished processing file: {file_path}")
    print(f"Processed file saved to: {output_path}\n")



In [43]:
# Ensure the processed_data directory exists
processed_data_dir = 'processed_data'
os.makedirs(processed_data_dir, exist_ok=True)

# Folder holding the per-annotator files
output_dir = "individual_annotators_results"

# Annotators are numbered from 1 in the file names
for annotator_id in range(number_of_annotators):
    for fp in file_paths:
        run_name = os.path.splitext(os.path.basename(fp))[0]
        preprocess_file(f"{output_dir}/{run_name}_Annotator_{annotator_id + 1}.txt", processed_data_dir)

Processing file: individual_annotators_results/RaR_Emotion_Prompt_CoT_Few_shot_all_annotators_2024-06-03_20-20-45_Annotator_1.txt
Found text: Text: The market is not a reliable partner. The yield is important and nature and the environment are secondary, if damage occurs there then so be it. Until now, the working method was as just described. Perhaps the municipality can play a coercive role in this and only cooperate with market parties if the damage is 0 or repaired as much as possible at the expense of the market party.
Found topics string: Municipality and residents engagement in the energy sector, Market Determination Dynamics
Extracted topics: ['Municipality and residents engagement in the energy sector', 'Market Determination Dynamics']
Processed topics: [1, 4]
Found text: Text: Not a good plan! Generating far too much energy in our municipality to provide parts of the country with energy. See point 5!
Found topics string: Municipality and residents engagement in the energy sec

In [45]:
def combine_results(file_paths):
    """
    Combine results from multiple files
    For each text -> topics from each file

    Every file is split into records on blank lines. A record is split at the last
    "{'Topics': " marker: what precedes it is the text (a leading "Text:" prefix is
    removed, the rest of the text is left untouched) and what follows is the topics.
    Records that mention 'Topics' but do not contain the marker are reported and skipped.

    Args:
        file_paths (list): the list of file paths that has been preprocessed
    Returns:
        combined_results (defaultdict) : text -> list of "{'Topics': ...}" strings, one per record found

    """
    marker = "{'Topics': "
    combined_results = defaultdict(list)

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            content = file.read().strip().split("\n\n")

        for entry in content:
            if 'Topics' not in entry:
                continue

            # Split at the LAST marker so a text that itself contains the marker is kept whole
            text_part, found, label_part = entry.rpartition(marker)
            if not found:
                print(f"Skipping entry due to parsing error: {entry}")
                continue

            text = text_part.strip()
            # Only the leading prefix is dropped; "Text: " inside the text is part of the text
            if text.lower().startswith("text:"):
                text = text[len("text:"):].strip()

            combined_results[text].append(marker + label_part.strip())

    return combined_results

def aggregate_results(combined_results, num_runs=None):
    """
    Aggregate the topics results from multiple files using majority vote

    A topic is kept for a text when it was assigned in at least half of the runs.
    Empty labels (from runs that assigned no topic, i.e. "{'Topics': }") are not counted,
    so "no topic" can never be reported as an assigned topic.

    Args:
        combined_results (dict): the combined results from preprocessed files (for each text -> topics from each file)
        num_runs (int, optional): number of runs that voted. When omitted, the length of
            the notebook-level 'file_paths' list is used.
    Returns:
        aggregated_results (dict) : text -> list of topics selected by the vote

    """
    if num_runs is None:
        # Default: one vote per raw result file listed in the notebook configuration
        num_runs = len(file_paths)
    min_votes = num_runs / 2

    aggregated_results = {}
    for text, topic_lines in combined_results.items():
        topic_counts = Counter()
        for topic_line in topic_lines:
            labels = topic_line.replace("{'Topics': ", "").replace("}", "").split(',')
            topic_counts.update(label.strip() for label in labels if label.strip())

        aggregated_results[text] = [topic for topic, count in topic_counts.items() if count >= min_votes]

    return aggregated_results

def save_combined_results(combined_results, output_path):
    """
    Write the combined results of all files to a text file

    Each text produces one record: a "Text: ..." line, one line per collected topics
    string, and a blank line closing the record.

    Args:
        combined_results (dict): for each text -> the topics strings collected from each file
        output_path (str): the output path of the new file

    """
    with open(output_path, 'w') as out:
        for text, topics_list in combined_results.items():
            record = [f"Text: {text}\n"]
            record.extend(f"{topics}\n" for topics in topics_list)
            record.append("\n")
            out.writelines(record)

def save_aggregated_results(aggregated_results, output_path):
    """
    Write the majority-vote topics of every text to a text file

    Each text produces a "Text: ..." line followed by an "Assigned Topics: ..." line
    (topics joined with ', ') and a blank line.

    Args:
        aggregated_results (dict) : for each text -> the topics selected by majority vote
        output_path (str): the output path of the new file
    """
    with open(output_path, 'w') as out:
        out.writelines(
            f"Text: {text}\nAssigned Topics: {', '.join(topics)}\n\n"
            for text, topics in aggregated_results.items()
        )

aggregated_results/Combined_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_1.txt aggregated_results/Aggregated_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_1.txt
aggregated_results/Combined_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_2.txt aggregated_results/Aggregated_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_2.txt
aggregated_results/Combined_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_3.txt aggregated_results/Aggregated_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_3.txt
aggregated_results/Combined_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_4.txt aggregated_results/Aggregated_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_4.txt
aggregated_results/Combined_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_5.txt aggregated_results/Aggregated_Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_5.txt


In [None]:
# Folder receiving the combined and aggregated text files; it has to exist before
# the files are opened for writing
aggregated_dir = 'aggregated_results'
os.makedirs(aggregated_dir, exist_ok=True)

for annotator_id in range(number_of_annotators):
    annotator_number = annotator_id + 1  # annotators are numbered from 1 in the file names
    output_combined_text_path = f"{aggregated_dir}/Combined_Results_{method}_Annotator_{annotator_number}.txt"
    output_aggregated_text_path = f"{aggregated_dir}/Aggregated_Results_{method}_Annotator_{annotator_number}.txt"

    # Preprocessed files of this annotator, one per run
    file_paths_per_annotator = [
        f"processed_data/{os.path.splitext(os.path.basename(fp))[0]}_Annotator_{annotator_number}_processed.txt"
        for fp in file_paths
    ]

    # Combine the results from the text files into the desired format
    combined_results = combine_results(file_paths_per_annotator)

    # Save the combined results to a text file
    save_combined_results(combined_results, output_combined_text_path)

    # Aggregate the results using majority vote
    aggregated_results = aggregate_results(combined_results)

    # Save the aggregated results to a text file
    save_aggregated_results(aggregated_results, output_aggregated_text_path)

    print(output_combined_text_path, output_aggregated_text_path)


In [46]:
def load_aggregated_data_csv(annotator_id):
    """
    Load aggregated results in a csv for each annotator

    Reads 'Empty_data_set.csv', adds one column per topic initialised to '0', sets the
    column to '1' for every topic assigned to a text in the annotator's
    Aggregated_Results file, and saves the result in 'llm_annotated_data'
    (the folder is created when missing).

    Texts are matched against the stripped 'english' column. When the same text occurs
    in several rows, all of those rows are filled. Rows whose 'english' cell is not a
    string are never matched.

    Args:
        annotator_id (int) : zero-based annotator index (file names use annotator_id + 1)
    """
    # Load the provided empty CSV file
    empty_df = pd.read_csv('Empty_data_set.csv', delimiter=';')

    # Define the mapping of labels to column names
    label_to_column = {
        '1': 'Municipality and residents engagement in the energy sector',
        '2': 'Energy storage and supplying energy in The Netherlands',
        '3': 'Wind and solar energy',
        '4': 'Market Determination Dynamics',
        '5': 'Landscapes and windmills tourism',
        '6': 'Hydrogen energy pipeline networks'
    }

    # Map every text to ALL the rows holding it, so duplicated texts are filled as well
    text_to_indices = defaultdict(list)
    for idx, raw_text in empty_df['english'].items():
        if isinstance(raw_text, str):  # skips missing values, which have no .strip()
            text_to_indices[raw_text.strip()].append(idx)

    # Initialize all columns to '0'
    for column in label_to_column.values():
        empty_df[column] = '0'

    # Fill the columns based on the aggregated labels from the Aggregated_Results.txt file
    output_aggregated_text_path = "aggregated_results/Aggregated_Results_"+ method + "_Annotator_"+ str(annotator_id + 1) + '.txt'
    with open(output_aggregated_text_path, 'r') as file:
        content = file.read().strip().split("\n\n")

    for entry in content:
        lines = entry.split("\n")
        if len(lines) < 2:
            continue

        # Remove only the leading prefixes; the same words inside the text are kept
        text = lines[0].strip()
        if text.startswith("Text:"):
            text = text[len("Text:"):].strip()

        topics_line = lines[1].strip()
        if topics_line.startswith("Assigned Topics:"):
            topics_line = topics_line[len("Assigned Topics:"):]
        # Splitting on ',' and dropping blanks tolerates empty or trailing separators
        topics = [label.strip() for label in topics_line.split(",") if label.strip()]

        for idx in text_to_indices.get(text, ()):
            for label in topics:
                column_name = label_to_column.get(label)
                if column_name is not None:
                    empty_df.at[idx, column_name] = '1'

    # Save the filled CSV file
    os.makedirs('llm_annotated_data', exist_ok=True)
    filled_csv_path_final = 'llm_annotated_data/Results_' + method + "_Annotator_"+ str(annotator_id + 1) + '.csv'
    empty_df.to_csv(filled_csv_path_final, index=False, sep=';')

    print(filled_csv_path_final)

llm_annotated_data/Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_1.csv
llm_annotated_data/Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_2.csv
llm_annotated_data/Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_3.csv
llm_annotated_data/Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_4.csv
llm_annotated_data/Results_RaR-Few-shot-CoT-Emotion-Prompt_Annotator_5.csv


In [46]:
# Write one filled CSV per annotator; the function expects a zero-based index
for annotator_index in range(number_of_annotators):
    load_aggregated_data_csv(annotator_index)