#  Prompts for automatic, few-shot annotation using GPT-4

In [None]:
import csv
import json
import os

import openai

# OpenAI API credentials.
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into this file; the original 'sk-' placeholder remains the fallback,
# so behaviour is unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'sk-')

# System prompt describing the annotation task; it is sent as the first system
# message of every chat request.
# The class prefixes it explains (N_, Act_, F_, P_) are the key prefixes used in
# the feature_definitions dict.
# NOTE(review): the prompt names its inputs [Reviews], [feature] and [Features],
# while the request loop tags them "[Review N]" and "[Feature Definitions]" --
# confirm whether the wording should be aligned. The text is left untouched here
# because every character of it is part of the prompt.
task_description = (
    """As an advanced linguist and NLP assistant, your task is to conduct Named Entity Recognition on park reviews. You have data on [Reviews] and [feature]. 
    The [Features] provides definitions for four major classes (with prefixes N_, Act_, F_, and P_) that cover subcategories of park features. "N_" stands for natural features, "F_" for facilities, "Act_" for activities, and "P_" for perceptions. 
    Your goal is to extract all relevant entities from [Reviews] that match definitions in feature_definitions.
    Output directly in the format of few_shot_examples.
    Each numbered sentence outputs a result with the original number.""")

# Few-shot demonstration, sent as the second system message of every request.
# Input part: two numbered review sentences, each written as "<number><TAB><sentence>".
# Output part: one JSON object per sentence, keyed by the sentence number and
# mapping a feature class to a comma-separated string of extracted entities.
# The two adjacent string literals below are concatenated into a single string.
few_shot_examples = (
    "Few shot examples:\n"
    """ [27559	great for running because of the slope and altitude.
         20664	lovely park for a jog or walk, toilet is under repair.]
    output: 
     {"27559": {"Act_active": "running","N_Terrain": "slope, altitude"}}
     {"20664": {"Act_active": "jog, walk", "F_service": "toilet"}}
     """)

# Column positions read from GM_split_sentence_150_300.csv.
# NOTE(review): judging by the few-shot example format ("<number>\t<sentence>"),
# column 26 presumably holds the sentence number and column 2 the sentence
# text -- confirm against the CSV header.
ID_COLUMN = 26
TEXT_COLUMN = 2

# Build one "<column 26>\t<column 2>" line per usable CSV row.
reviews = []
# newline="" is what the csv module expects from the file object, so quoted
# fields containing embedded line breaks are parsed correctly.
with open("GM_split_sentence_150_300.csv", 'r', newline="") as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    for row in reader:
        # Rows too short to contain both columns are skipped explicitly
        # (previously they were hidden behind a bare `except:` that would
        # also have swallowed any unrelated error).
        if len(row) <= max(ID_COLUMN, TEXT_COLUMN):
            continue
        # Skip rows where either value is empty.
        if not row[ID_COLUMN] or not row[TEXT_COLUMN]:
            continue
        reviews.append(row[ID_COLUMN] + "\t" + row[TEXT_COLUMN])

# Label scheme handed to the model: maps each feature label to a short
# natural-language definition. The whole dict is formatted into the
# "[Feature Definitions]" user message of every request.
# Key prefixes: Act_ = activities, F_ = facilities, N_ = natural features,
# P_ = perceptions (as described in task_description).
feature_definitions = {
    # Activities
    "Act_stationary": "Stationary activities such as resting, standing, sitting, lying down, waiting.",
    "Act_active": "Moderate or higher intensity physical activities, including all walk, jog.",
    "Act_light": "Light physical activities or activities with uncertain intensity suitable for all ages, like family activities, BBQ.",

    # Facilities
    "F_stationary": "Facilities for stationary activities, such as benches, gazebos, pavilion.",
    "F_active": "Facilities for moderate or higher intensity physical activities and exercise.",
    "F_recreation": "Facilities for light recreational activities, like playground, picnic area",
    "F_cultural": "Facilities for light cultural/historical activities, like museums, memorial halls, exhibition, cultural sites",
    "F_catering": "Catering facility/building for eating and drinking, including shops, vending machines, water fountains",
    "F_Transport": "Transport facility, parking lots, bus, subway station",
    "F_Sign": "Facilities and systems providing park information and navigation, including directional signs, maps, and educational panels.",
    "F_path": "All kinds of Trails, path and road.",
    "F_service": "Facilities for supporting service such as visitor center, toilets",
    "F_lights": "lighting equipment",

    # Natural features
    "N_Terrain": "Terrain features, such as hill, slopes, stairs, and natural landforms.",
    "N_Plant": "Plant/vegetation. Excluding flower.",
    "N_Flower": "Flower, excluding terms used to describe other objects, like flower shape.",
    "N_Animal": "Animal, excluding terms used to describe other things, like goat ice cream.",
    "N_Water": "Water bodies, such as lakes, rivers, and ponds, excluding drinking water facilities",
    "N_Weather": "Describing or indicating weather.",
    "N_Color":  "Color, excluding terms used to describe other objects, like golden hours.",

    # Perceptions
    "P_Sound": "Auditory sensory,including natural and artificial sounds.",
    "P_Olf": "Olfaction/smell sensory, like spice.",
    "P_Tactile": "Sensory derived from touching natural and man-made surfaces, like the perception of texture, temperature, moisture, and firmness of surfaces and materials, like breeze blows",
    "P_Crowd": "Terms relating crowd."
}

# Messages shared by every request: the task description and the few-shot
# examples. This list is never mutated; each batch gets its own copy.
messages = [
    {"role": "system", "content": task_description},
    {"role": "system", "content": few_shot_examples},
]

# Number of review lines sent to the model per request.
BATCH_SIZE = 50

# The label definitions are identical for every batch, so format them once.
feature_message = {"role": "user", "content": f"[Feature Definitions]: {feature_definitions}"}

# Append mode keeps the output of earlier runs. The context manager guarantees
# the file is closed even if a request raises part-way through.
with open("final_file_150300.json", "a") as f:
    for index in range(0, len(reviews), BATCH_SIZE):
        review = "\n".join(reviews[index: index + BATCH_SIZE])

        # Build a fresh message list per batch. Appending to the shared
        # `messages` list made every request resend all previous batches plus
        # one extra copy of the definitions each time, so the prompt grew
        # without bound and earlier reviews were annotated again.
        batch_messages = messages + [
            {"role": "user", "content": f"[Review {index // BATCH_SIZE + 1}]: {review}"},
            feature_message,
        ]
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=batch_messages,
        )

        content = response['choices'][0]['message']['content']
        f.write(content)
        f.write("\n")
        f.flush()  # keep finished batches on disk if a later request fails

        print(content)


# Prompts (labels) for zero-shot classification

In [None]:
import torch
from transformers import pipeline

# Device index handed to the pipeline: 0 is the first CUDA GPU (use 1, 2, ...
# for other GPUs) and -1 means CPU. Fall back to CPU when CUDA is unavailable
# instead of failing while the model is being loaded.
device = 0 if torch.cuda.is_available() else -1

# Load the zero-shot classifier once; the later cells reuse `classifier`.
classifier = pipeline(
    "zero-shot-classification",
    model="MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli",
    device=device,
)

In [None]:
from transformers import pipeline
import pandas as pd

# Read the lexicon test set and pull its 'text' column out as a plain list,
# which is the input the batch loop below slices.
aesthetic_df = pd.read_csv("test_lexicon_LLM.csv")

texts_to_classify = aesthetic_df["text"].to_list()

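# NOTE: every candidate_labels group in this cell is commented out. Uncomment one
# group before running the cell; otherwise `candidate_labels` is not defined
# anywhere above the batch loop that uses it.
# Group: weather, path, lighting (10 labels)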
# candidate_labels = [ 
        # "mentions weather",  "does not mention weather", "involves physical paths/pathways",
        # "involves physical tracks for activities", "involves physical trails/roads/walkways",
        # "involves physical stairs/ bridges", "does not involve any physical tracks, trails, paths, roads, pathways, walkways, or stairs",
        # "involves lighting facilities/equipment", "involves lighting levels", "not related to lighting facilities or equipment"
# ]
# terrain, animal, water 10
# candidate_labels = [
#         "relate to terrain", "relate to elevation", 
#         "relate to changes in elevation", "does not talk about any terrain types, flat, elevation or changes in elevation",
#         "references to actual fauna/animals present", "reference to insects", "No references to fauna/animals or insects",  
#         "involves water landscapes", "involves drinking water facilities","not related to water landscapes"
# ]


# sign, sound, olf 11
# candidate_labels = [
#         "involves signage/marking facilities/systems", "mentions getting lost or losing directions" ,
#         "mentions inability to find directions", "not related to signage/marking facilities or systems", 
#         "mentions sound", "mentions noise", "mentions quietness/tranquility", "not related to sound/ noise/quietness/tranquility",
#         "mentions olfaction/smell perception", "mentions fresh air", "not related to olfaction/smell perception or fresh air"
# ]

# F_active,F_recreation, F_cultural, F_Transport 9
# candidate_labels = [
#         "comment facilities for moderate to vigorous physical activity ", 
#         "mentions facilities/structure/courses mainly for exercise",
#         "nothing to do with facilities for moderate to vigorous physical activity",
        
#         "mentions facilities for recreation instaed of for active physical activity", 
#         "does not mention facilities for recreation",
#         "mention facilities for active physical activity instaed of recreation",
        
#         "mentions facilities/installations/sites commemo-rating history, or for cultural and artistic viewing", 
#         "does not mention facilities/installations/sites serving as historical or cultural and artistic points in parks", 
        
#         "mentions facilities for transport", "does not mention facilities for transport"
# ]


# # Act_stationary, Act_light, F_stationary, F_service 10
# candidate_labels = [
#         "mentions sedentary activity, like rest, chill, relax, etc",
#         "relate to static activity",
            
#          "mentions light physical activity instead of moderate-to-vigorous physical activity or exercise",
#         "mentions outdoor activities of light intensity, such as camping, picnicking, fishing, and BBQ",
#         "does not mention any outdoor activities of light intensity, such as camping, picnicking, fishing, and BBQ",
        
#         "mentions facilities/structures providing chances for rest or relative static activity", 
#         "does not mention facilities/structures/building providing chances for rest or relative static activity" ,   
        
#         "involve the evaluation of general service facilities offering sale, repair, rental, emergency, toielt, etc.", 
#         "not involve the evaluation of general service facilities", 
#         "mentions facilities providing convenience service/information/tickets for visitors"
# ]

# # Act_active, P_Crowd, N_Color, F_catering 8
# candidate_labels = [
#         "relate to moderate to vigorous physical activity",
#         "nothing to do with moderate to vigorous physical activity",
#         "mentions facilities/structures/equipments providing drinks/drinking water or food", 
#         "does not mention facilities/structures/equipments providing drinks or food",
#         "mentions color", "does not mention any color",
#         "mentions crowd or not crowd", "nothing about whether crowd or not crowd"
# ]


# N_Plant, N_Flower, P_Tactile 10
# candidate_labels = [
#         "mentions perceptions, like insect bite, hot or cold." ,                  
#         "mentions tactile perceptions, such as feeling breezes, wetness, temperature, etc",
#         "mentions perceptions, like insect bite, breeze, humid, dry, hot or cold."  ,
#         "does not mention any perceptions, like insect bite, breeze, humid, dry, hot or cold.",
        
#         "mentions real vegetations/plants/greenery/flora", 
#         "mentions flora which refer to artificial installations in the shape of flora",
#         "does not mention vegetations/plants/greenery/flora",
            
#         "mention flowers", "comment real plantations with flowers", "does not mention any flowers"
# ]

batch_size = 10  # Number of texts passed to the classifier per call
save_interval = 5  # Write a checkpoint CSV every `save_interval` batches

# The output column for each label does not change between rows, so derive the
# names once: "prob_" + label with spaces replaced by underscores (all other
# characters of the label are kept as they are).
label_columns = {label: f"prob_{label.replace(' ', '_')}" for label in candidate_labels}

# Number of batches processed so far
batches_processed = 0

# Process in batches
for start_index in range(0, len(texts_to_classify), batch_size):
    batch_texts = texts_to_classify[start_index:start_index + batch_size]

    # Classify the batch against every candidate label
    outputs = classifier(batch_texts, candidate_labels, multi_label=True)
    # Older transformers releases return a bare dict instead of a one-element
    # list when the batch holds a single text; normalise so the loop below
    # always iterates over per-text results.
    if isinstance(outputs, dict):
        outputs = [outputs]

    for batch_index, output in enumerate(outputs):
        # Row label in the DataFrame; relies on the default 0..n-1 index
        # produced by read_csv lining up with positions in texts_to_classify.
        index = start_index + batch_index

        # label -> score for this text
        labels_scores = dict(zip(output['labels'], output['scores']))

        # Store one probability column per label; 0.0 if a label is missing
        for label, column_name in label_columns.items():
            aesthetic_df.at[index, column_name] = labels_scores.get(label, 0.0)

    batches_processed += 1

    # Periodic checkpoint. Note that this overwrites the input CSV in place.
    if batches_processed % save_interval == 0:
        aesthetic_df.to_csv('test_lexicon_LLM.csv', index=False)
        print(f"Processed and saved after batch {batches_processed}")

# Save the trailing batches if the last checkpoint did not already cover them
if batches_processed % save_interval != 0:
    aesthetic_df.to_csv('test_lexicon_LLM.csv', index=False)
    print(f"Processed and saved the final batch. Total batches processed: {batches_processed}")

# Print DataFrame including classification results with probabilities
print(aesthetic_df)


# Prompts for identifying sentences containing underrepresented entities

In [None]:
from transformers import pipeline
import pandas as pd

# Read the sentence-level table and pull its 'split_sentence' column out as a
# plain list, which is the input the batch loop below slices.
aesthetic_df = pd.read_csv("GMTA_Aspect_sentence_prob.csv")

texts_to_classify = aesthetic_df["split_sentence"].to_list()

# Candidate labels scored for every sentence by the zero-shot classifier.
# Each label also becomes a DataFrame column named "prob_<label>" with spaces
# replaced by underscores.
# NOTE(review): "commnets" looks like a typo for "comments". It is left as-is
# because the label text is passed to the classifier and forms the column
# name, so changing it would change results and existing column names.
candidate_labels = [ 
    "Mention facilities for moderate to high intensity physical activity", "Mention carpark/accessibility/transporting commnets",
    "mention sedentary or static activities", "mention transportation related facilities in the park", 
    "mention terrain", "mention lighting", "mention touch hearing smell perception", "mention facilities in park"
    
]

batch_size = 10  # Number of texts passed to the classifier per call
save_interval = 5  # Write a checkpoint CSV every `save_interval` batches

# Checkpoints and the final result go to two different files.
# NOTE(review): the differing names may be an unintended leftover; both are
# kept so that every file written before is still written. Confirm whether
# checkpoints should also go to the GMTA_ file.
checkpoint_path = 'Aspect_sentence_prob.csv'
output_path = 'GMTA_Aspect_sentence_prob.csv'

# The output column for each label does not change between rows, so derive the
# names once: "prob_" + label with spaces replaced by underscores.
label_columns = {label: f"prob_{label.replace(' ', '_')}" for label in candidate_labels}

# Number of batches processed so far
batches_processed = 0

# Process in batches
for start_index in range(0, len(texts_to_classify), batch_size):
    batch_texts = texts_to_classify[start_index:start_index + batch_size]

    # Classify the batch against every candidate label
    outputs = classifier(batch_texts, candidate_labels, multi_label=True)
    # Older transformers releases return a bare dict instead of a one-element
    # list when the batch holds a single text; normalise so the loop below
    # always iterates over per-text results.
    if isinstance(outputs, dict):
        outputs = [outputs]

    for batch_index, output in enumerate(outputs):
        # Row label in the DataFrame; relies on the default 0..n-1 index
        # produced by read_csv lining up with positions in texts_to_classify.
        index = start_index + batch_index

        # label -> score for this text
        labels_scores = dict(zip(output['labels'], output['scores']))

        # Store one probability column per label; 0.0 if a label is missing
        for label, column_name in label_columns.items():
            aesthetic_df.at[index, column_name] = labels_scores.get(label, 0.0)

    batches_processed += 1

    # Periodic checkpoint
    if batches_processed % save_interval == 0:
        aesthetic_df.to_csv(checkpoint_path, index=False)
        print(f"Processed and saved after batch {batches_processed}")

# Always write the final result to output_path. Previously this save was
# skipped whenever the batch count was an exact multiple of save_interval,
# which left the complete results only in the checkpoint file.
aesthetic_df.to_csv(output_path, index=False)
print(f"Processed and saved the final batch. Total batches processed: {batches_processed}")

# Print DataFrame including classification results with probabilities
print(aesthetic_df)
