In [162]:
import ast
import json
import logging

import pandas as pd
from tenacity import retry, wait_random_exponential, stop_after_attempt
from tqdm.auto import tqdm

In [50]:
import os
from dotenv import load_dotenv
# Read environment variables from the .env file one directory above the notebook;
# OPENAI_API_KEY is looked up in the environment when the API client is created.
load_dotenv("../.env")

True

In [51]:
# import openai
# openai.api_key = os.getenv("OPENAI_API_KEY")

In [209]:
# New-style OpenAI SDK: requests go through a client object instead of the
# module-level `openai.api_key` setup used by the commented-out cell above.
from openai import OpenAI
from openai import OpenAIError

# Shared client used by run_openai_task for every chat-completion request.
client = OpenAI(
  api_key=os.getenv('OPENAI_API_KEY'),  # this is also the default, it can be omitted
)

In [216]:
# Load the baseline predictions. The `locations` column holds the string repr of a
# Python list, so it is parsed with ast.literal_eval, which accepts literals only
# (the previous `eval` converter would execute any expression found in the CSV).
raw_preds = pd.read_csv(
    "../submissions/multibert_ukru_lingua_00_ot_stoprows4_n.csv",
    converters={"locations": ast.literal_eval},
)
df_test = pd.read_csv("../data/nlp-ua-locations-extractions/test.csv")

# The texts are attached by index alignment, so both files must describe the same rows;
# a silent length mismatch would otherwise leave NaN texts.
assert len(raw_preds) == len(df_test), "predictions and test set differ in length"
raw_preds['text'] = df_test['text']

In [144]:
# System prompt: frames the model as a dataset labeler that must judge every suggested
# location entity (texts are Ukrainian/Russian) as valid or not and explain the verdict.
system_message = """
You are a professional datasets labelist. Your job is labeling datasets for further usage in machine learning tasks. Now you are working on the dataset for NER task, which focuses on location extraction. 
You will be given a couple of text samples in Ukrainian and Russian languages along with the set of possible location entities in them. Your task is to inspect each suggested entity and make a verdict on whether it's a valid location entity in the given context or not and explain your decision.  
""" 

# Few-shot example (user turn): one text followed by the candidate entities to be judged.
# It is sent before the real sample in run_openai_task, paired with the assistant example.
user_message_example = """
Text: "–†–û–ó–®–£–ö–£–Æ–¢–¨–°–Ø –≤–ª–∞—Å–Ω–∏–∫–∏ –ø–µ—Å–∏–∫–∞ –∑–æ–±—Ä–∞–∂–µ–Ω–æ–≥–æ –Ω–∞ —Ñ–æ—Ç–æ —è–∫–∏–π –±–ª–∏–∑—å–∫–æ 1-2 —Ç–∏–∂–Ω—ñ–≤ –ø–µ—Ä–µ–±—É–≤–∞—î –Ω–∞ —Ç–µ—Ä–∏—Ç–æ—Ä—ñ—ó –î–∞—Ä–Ω–∏—Ü—å–∫–æ–≥–æ —Ä–∞–π–æ–Ω—É (–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9, –∫–æ—Ä–ø—É—Å 54, —Ç–µ—Ä–∏—Ç–æ—Ä—ñ—è –∫–æ–ª–∏—à–Ω—å–æ–≥–æ —Ä–∞–¥—ñ–æ–∑–∞–≤–æ–¥—É)

üê∂–º–æ–∂–ª–∏–≤–æ —Ö—Ç–æ—Å—å –≤–ø—ñ–∑–Ω–∞—î —Å–≤–æ–≥–æ –¥–æ–º–∞—à–Ω—å–æ–≥–æ —É–ª—é–±–ª–µ–Ω—Ü—è —Ç–∞ –∑–∞–±–µ—Ä–µ –¥–æ–¥–æ–º—Éüè†

üü•–•–∞—Ä–∞–∫—Ç–µ—Ä–Ω—ñ –æ–∑–Ω–∞–∫–∏ –ø–µ—Å–∏–∫–∞:
- –∫–æ–ª—ñ—Ä —á–æ—Ä–Ω–æ-–∫–æ—Ä–∏—á–Ω–µ–≤–∏–π;
- –æ—à–µ–π–Ω–∏–∫ –∑–µ–ª–µ–Ω–æ–≥–æ –∫–æ–ª—å–æ—Ä—É (—Ö–∞–∫—ñ);
-—Ö–≤–æ—Å—Ç–∏–∫ –Ω–∞ –∫—ñ–Ω—á–∏–∫—É –∑—ñ —Å—Ç—Ä–∏–∂–∫–æ—é.

‚òùüèª–ó–≤–µ—Ä–Ω—É—Ç–∏—Å—è –º–æ–∂–Ω–∞ –¥–æ –º–µ—à–∫–∞–Ω—Ü—ñ–≤ –±–∞–≥–∞—Ç–æ–∫–≤–∞—Ä—Ç–∏—Ä–Ω–æ–≥–æ –±—É–¥–∏–Ω–∫—É –∑–∞ –∞–¥—Ä–µ—Å–æ—é –ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9, —è–∫—ñ –π–æ–≥–æ —Ç—É—Ç –±–∞—á–∞—Ç—å —â–æ–¥–Ω—è. 
–£ –≤–∏–ø–∞–¥–∫—É  —è–∫—â–æ –≤–ø—ñ–∑–Ω–∞–ª–∏ –ø–µ—Å–∏–∫–∞ —Ç–∞ –Ω–µ –∑–Ω–∞—î—Ç–µ –¥–µ –∑–Ω–∞—Ö–æ–¥–∏—Ç—å—Å—è —Ç–µ—Ä–∏—Ç–æ—Ä—ñ—è —Ç–µ–ª–µ—Ñ–æ–Ω—É–π—Ç–µ –∑–∞ –Ω–æ–º–µ—Ä–æ–º 0932626376."
Entities: ['–î–∞—Ä–Ω–∏—Ü—å–∫–æ–≥–æ —Ä–∞–π–æ–Ω—É', '–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9', '–∫–æ—Ä–ø—É—Å 54', '–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9']
"""

# Few-shot example (assistant turn): the expected verdicts for user_message_example,
# one report per candidate entity, in the order the candidates were listed.
# Written as strict JSON (double-quoted keys and values, comma-separated members, one
# flat array). The previous version lacked the comma after every "entity" member and
# wrapped the array in [[ ]], so the example itself could not be parsed, while the
# model's answer is parsed with json.loads.
assisstant_message_example = """
[
    {
        "entity": "–î–∞—Ä–Ω–∏—Ü—å–∫–æ–≥–æ —Ä–∞–π–æ–Ω—É",
        "valid": "YES",
        "explanation": "Refers to the district of city in which a puppy was lost"
    },
    {
        "entity": "–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9",
        "valid": "NO",
        "explanation": "only part of the full address '–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9, –∫–æ—Ä–ø—É—Å 54' which is given in the text"
    },
    {
        "entity": "–∫–æ—Ä–ø—É—Å 54",
        "valid": "NO",
        "explanation": "only part of the full address '–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9, –∫–æ—Ä–ø—É—Å 54' which is given in the text"
    },
    {
        "entity": "–ë–æ—Ä–∏—Å–ø—ñ–ª—å—Å—å–∫–∞, 9",
        "valid": "YES",
        "explanation": "address of the mentioned citizen, it's a complete one in the context"
    }
]
"""

def user_message(text, entities):
    """Render the user turn for one sample: the quoted text, then its candidate entities.

    The exact layout (leading newline, four-space indentation, the space after the
    entity list) is part of the prompt and is reproduced unchanged.
    """
    template = '\n    Text: "{}"\n    Entities: {} \n    '
    return template.format(text, entities)

In [145]:
def generate_functions() -> list:
    """Build the function-calling spec passed as `functions=` to the chat completion.

    Returns:
        A one-element list describing `extract_entities`. Its single argument
        `entities` is an array of reports, each with the string fields
        `entity`, `valid` ("YES" or "NO") and `explanation`.
    """
    # Every field is required: the caller indexes ['entities'], ['valid'] and
    # ['entity'] unconditionally, so an omitted key would discard the whole answer.
    entity_report = {
        "type": "object",
        "properties": {
            "entity": {"type": "string"},
            # extract_entities keeps a report only when valid == "YES"
            "valid": {"type": "string", "enum": ["YES", "NO"]},
            "explanation": {"type": "string"},
        },
        "required": ["entity", "valid", "explanation"],
    }

    # A function spec consists of name / description / parameters; the stray
    # top-level "type" key of the previous version was dropped.
    return [
        {
            "name": "extract_entities",
            "description": "Select valid entities from the full explanations",
            "parameters": {
                "type": "object",
                "properties": {
                    "entities": {
                        "description": "entities reports",
                        "type": "array",
                        "items": entity_report,
                    }
                },
                "required": ["entities"],
            },
        }
    ]

In [146]:
def extract_entities(entities):
    """Return the `entity` value of every report whose `valid` field equals "YES".

    Args:
        entities: iterable of mappings that each provide the keys 'entity' and 'valid'.

    Returns:
        List of the accepted entity values, in input order.
    """
    return [report['entity'] for report in entities if report['valid'] == "YES"]

In [229]:
# @retry(wait=wait_random_exponential(min=1, max=4), stop=stop_after_attempt(2))
# NOTE: the retry decorator stays disabled. run_openai_task handles its own failures and
# does not raise on request/parsing errors, so tenacity would have nothing to retry on.
def run_openai_task(text, entities, model="gpt-3.5-turbo-0613"):
    """Ask the chat model which of the candidate `entities` are valid locations in `text`.

    The request carries the system prompt, one worked example (user + assistant turn)
    and the sample itself, and forces a call to the `extract_entities` function so the
    verdicts come back as JSON arguments.

    Args:
        text: the text sample the entities were predicted for.
        entities: candidate location entities for `text`.
        model: chat model name; the default is the model previously hard-coded here.

    Returns:
        dict with two keys:
          * "function_response": the entities the model marked "YES". On any failure
            the unmodified `entities` are returned instead (best-effort fallback).
          * "model_response": the API response converted with `model_dump()`, or, on
            failure, {"error": <message>, "error_type": <exception class name>}.
            The failure record holds plain strings only, so it can be written to JSON
            as-is (the raw exception object stored previously could not).
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message_example},
        {"role": "assistant", "content": assisstant_message_example},
        {"role": "user", "content": user_message(text=text, entities=entities)}
    ]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            functions=generate_functions(),
            function_call={"name": "extract_entities"},
            temperature=0,
            frequency_penalty=0,
            presence_penalty=0,
        )

        function_call = response.choices[0].message.function_call

        available_functions = {"extract_entities": extract_entities}
        function_to_call = available_functions[function_call.name]
        logging.info("function_to_call: %s", function_to_call)

        function_args = json.loads(function_call.arguments)['entities']
        logging.info("function_args: %s", function_args)

        function_response = function_to_call(function_args)
        model_response = response.model_dump()
    except Exception as error:
        # Deliberate best effort: a failed request or an unparsable answer must not
        # stop the batch, so the original candidates are kept for this sample.
        logging.warning(
            "run_openai_task failed, keeping original entities (%s: %s)",
            type(error).__name__,
            error,
        )
        function_response = entities
        model_response = {"error": str(error), "error_type": type(error).__name__}

    return {"model_response": model_response,
            "function_response": function_response}

In [237]:
# Re-check every row's predictions with the model. Rows without any candidate entity
# skip the API call and get an empty result / empty response placeholder.
locations = []
responses = []

text_candidate_pairs = zip(raw_preds['text'], raw_preds['locations'])
for sample_text, candidates in tqdm(text_candidate_pairs, total=len(raw_preds)):
    if len(candidates) > 0:
        res = run_openai_task(text=sample_text, entities=candidates)
        locations.append(res['function_response'])
        responses.append(res['model_response'])
    else:
        locations.append([])
        responses.append({})

  0%|          | 0/477 [00:00<?, ?it/s]

Expecting value: line 29 column 17 (char 477)
Expecting value: line 9 column 17 (char 170)


KeyboardInterrupt: 

In [239]:
# How many rows were processed before the loop above was interrupted.
len(locations)

433

In [241]:
# Inspect the last result obtained before the interrupt (index 432 is the 433rd row).
locations[432]

['–ê—Å–∫–∞–Ω—ñ—è-–ù–æ–≤–∞', '–•–µ—Ä—Å–æ–Ω—Å—å–∫—ñ–π –æ–±–ª–∞—Å—Ç—ñ', '–ø–∞—Ä–∫—É "–ê—Å–∫–∞–Ω—ñ—ó-–ù–æ–≤–æ—ó"']

In [250]:
# The loop above was interrupted, so `locations` covers only the first rows. Pad it with
# the baseline predictions of the rows that were never sent to the model. The start is
# taken from len(locations) instead of a hard-coded 433, which keeps the cell idempotent:
# a re-run finds nothing left to append instead of appending the tail a second time.
n_checked = len(locations)
locations += raw_preds['locations'].iloc[n_checked:].to_list()

In [251]:
# Sanity check: after padding there should be one entry per row of raw_preds.
len(locations)

477

In [252]:
# Store the filtered predictions next to the baseline ones (adds a column to raw_preds in place).
raw_preds['locations_new'] = locations

In [254]:
# Write the corrected predictions without the DataFrame index column.
raw_preds.to_csv("../submissions/gpt_corrections_091.csv", index=False)

In [267]:
from json.decoder import JSONDecodeError

In [280]:
# Make the collected responses JSON-serializable: a stored exception object is replaced
# by its message. Every exception type is handled (previously only JSONDecodeError was
# converted, so any other stored error would have made json.dump fail); responses that
# carry no exception object are passed through unchanged.
responses_fixed = [
    {'error': str(resp['error'])} if isinstance(resp.get('error'), Exception) else resp
    for resp in responses
]

In [281]:
# Persist the model responses. The context manager closes (and thereby flushes) the file,
# which the previous bare open() call never did. The file name is kept as it was.
with open('reponses.json', 'w') as fp:
    json.dump(responses_fixed, fp)

In [282]:
# Display the frame to compare `locations` (baseline) with `locations_new` (after filtering).
raw_preds

Unnamed: 0,text_id,locations,text,locations_new
0,0,[],"‚ùóÔ∏è–ö—ñ–ª—å–∫—ñ—Å—Ç—å –ø–æ—Ä–∞–Ω–µ–Ω–∏—Ö –∑—Ä–æ—Å–ª–∞ –¥–æ —Ç—Ä—å–æ—Ö, ‚Äì –ö–ª–∏—á–∫...",[]
1,1,"[–ö–∏—î–≤—ñ, –®—É–ª—è–≤—Å—å–∫–æ–≥–æ —à–ª—è—Ö–æ–ø—Ä–æ–≤–æ–¥—É, –®—É–ª—è–≤—Å—å–∫–æ–≥–æ]","ü•§–í –ö–∏—î–≤—ñ –∑–∞ 91,13 –º–ª–Ω –≥—Ä–∏–≤–µ–Ω—å –ø–æ—á–∏–Ω–∞—é—Ç—å—Å—è —Ä–æ–±–æ...","[–ö–∏—î–≤—ñ, –®—É–ª—è–≤—Å—å–∫–æ–≥–æ —à–ª—è—Ö–æ–ø—Ä–æ–≤–æ–¥—É]"
2,2,"[–ì–æ–≥–æ–ª–µ–≤–µ, –ú–∏—Ä–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ —Ä–∞–π–æ–Ω—É, –ü–æ–ª—Ç–∞–≤—Å—å–∫–æ—ó ...",‚ñ™Ô∏è–°—å–æ–≥–æ–¥–Ω—ñ –≤–Ω–æ—á—ñ —Ä–æ—Å—ñ—è–Ω–∏ –∑–∞–≤–¥–∞–ª–∏ —Ä–∞–∫–µ—Ç–Ω–æ–≥–æ —É–¥–∞...,"[–ì–æ–≥–æ–ª–µ–≤–µ, –ú–∏—Ä–≥–æ—Ä–æ–¥—Å—å–∫–æ–≥–æ —Ä–∞–π–æ–Ω—É, –ü–æ–ª—Ç–∞–≤—Å—å–∫–æ—ó ..."
3,3,[],–ù–∞—Ä–∞–∑—ñ —É –∑–∞–ø–∞—Å–∞—Ö —Ä–æ—Å—ñ—è–Ω –Ω–∞–π–±—ñ–ª—å—à–µ –±–∞–ª—ñ—Å—Ç–∏—á–Ω–∏—Ö ...,[]
4,4,"[–ø—Ä–æ—Å–ø–µ–∫—Ç—ñ –ê–∫–∞–¥–µ–º—ñ–∫–∞ –ì–ª—É—à–∫–æ–≤–∞, 9]","‚õ∏–í –æ–¥–∏–Ω –¥–µ–Ω—å, 29 —Å–µ—Ä–ø–Ω—è, –î–ü ""–¶–µ–Ω—Ç—Ä–∞–ª—å–Ω–∞ —É—á–±–æ–≤–æ...",[]
...,...,...,...,...
472,472,"[–ö–∏—ó–≤—â–∏–Ω–∏, –£–∫—Ä–∞—ó–Ω—ñ, –ö–∏—ó–≤—Å—å–∫–æ—ó –æ–±–ª–∞—Å—Ç—ñ, –ë—É—á–∞–Ω—Å—å...",–î–æ–ø–æ–º–æ–≥–∞ –Ω–∞ –≤—ñ–¥–Ω–æ–≤–ª–µ–Ω–Ω—è: –ø‚Äô—è—Ç—å –≥—Ä–æ–º–∞–¥ –ö–∏—ó–≤—â–∏–Ω–∏...,"[–ö–∏—ó–≤—â–∏–Ω–∏, –£–∫—Ä–∞—ó–Ω—ñ, –ö–∏—ó–≤—Å—å–∫–æ—ó –æ–±–ª–∞—Å—Ç—ñ, –ë—É—á–∞–Ω—Å—å..."
473,473,[–ó–µ–º–ª—ñ],–ü—Ä–æ—Ä–æ—Ü—Ç–≤–æ –°—Ç—Ä—É–≥–∞—Ü—å–∫–∏—Ö: –Ω–µ–∑–≤–∏—á–∞–π–Ω–∞ –±–∞–∫—Ç–µ—Ä—ñ—è –ë—Ä–∞...,[–ó–µ–º–ª—ñ]
474,474,[],–Ü–∑ 1 –≤–µ—Ä–µ—Å–Ω—è –£–ì–ö–¶ —ñ –ü–¶–£ –ø–µ—Ä–µ–π—à–ª–∏ –Ω–∞ –Ω–æ–≤–∏–π —Ü–µ—Ä–∫...,[]
475,475,"[–ö–∏—î–≤—ñ, –î–Ω—ñ–ø—Ä–∞, –î–Ω—ñ–ø—Ä–æ]",ü¶á –£ –ö–∏—î–≤—ñ –∑ –î–Ω—ñ–ø—Ä–∞ –≤–∏–ª–æ–≤–∏–ª–∏ —ñ–Ω–æ–∑–µ–º—Ü—è –≤ –º–∞—Å—Ü—ñ –ë...,"[–ö–∏—î–≤—ñ, –î–Ω—ñ–ø—Ä–∞, –î–Ω—ñ–ø—Ä–æ]"


In [283]:
# Load another submission file to merge the filtered predictions into. No converter is
# passed here, so `locations` is not parsed with the converter used for raw_preds.
best_preds = pd.read_csv("../submissions/multibert_ukru_lingua_0906_ot_stoprows4_n.csv")

In [285]:
# Only the first 433 rows (labels 0..432) were actually checked by the model before the
# run was interrupted; the rest of `locations_new` is baseline padding. `.loc` slices are
# label-based and include the end label, so the previous `.loc[:433]` also overwrote
# row 433 with an unchecked baseline prediction. Slice up to the last checked label.
N_GPT_CHECKED = 433
last_checked_label = N_GPT_CHECKED - 1
best_preds.loc[:last_checked_label, 'locations'] = raw_preds.loc[:last_checked_label, "locations_new"]

In [288]:
# Write the merged submission without the DataFrame index column.
best_preds.to_csv("../submissions/multibert_ukru_lingua_0906_ot_stoprows4_n_gpt091.csv", index=False)