In [None]:
!pip install --upgrade openai
!pip install arabic-reshaper
!pip install pyarabic
!pip install tiktoken
!pip install farasapy

In [None]:
import json
import os
import pickle
import re
import time
from typing import List

import numpy as np
import openai
import pandas as pd
import tiktoken


In [None]:

# Chat model used for every completion request.
COMPLETIONS_MODEL = "gpt-4"
# COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# written into the notebook; the literal placeholder is only used as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'YOUT_OPENAI_API_KEY')
# JSON Lines file holding the records (pq_id, passage, question) to answer.
dataset_file = "QQA23_TaskB_qrcd_v1.2_test_preprocessed.jsonl"

In [None]:
def load_jsonl(input_path) -> list:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank lines (for
    example a trailing empty line at the end of the file) are skipped instead
    of raising ``json.JSONDecodeError``.

    Args:
        input_path: Path of the UTF-8 encoded JSON Lines file.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Trailing '\n', '|' and '\r' characters are removed, as before.
            record_text = line.rstrip('\n|\r')
            if not record_text.strip():
                # Nothing to parse on this line.
                continue
            data.append(json.loads(record_text))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data


def get_last_token_index(text):
    """Return the zero-based index of the last whitespace-separated token.

    Returns -1 when ``text`` contains no tokens at all.
    """
    # An empty token list has length 0, which yields the -1 sentinel directly.
    return len(text.split()) - 1


def to_dict(answer, rank, score, start_token_indx, end_token_indx):
    """Bundle one answer and its metadata into a dictionary.

    The keys are inserted in this order: ``answer``, ``strt_token_indx``,
    ``end_token_indx``, ``rank``, ``score``.
    """
    record = dict(answer=answer)
    record["strt_token_indx"] = start_token_indx
    record["end_token_indx"] = end_token_indx
    record["rank"] = rank
    record["score"] = score
    return record


def form_one_answer(passage):
    """Return a one-element answer list in which the whole passage is the answer.

    The single record has rank 1 and score 1 and spans token 0 through the
    last whitespace-separated token of ``passage``.
    """
    whole_passage = to_dict(
        answer=passage,
        rank=1,
        score=1,
        start_token_indx=0,
        end_token_indx=get_last_token_index(passage),
    )
    return [whole_passage]


In [None]:
def get_phrase_index(text, phrase):
    """Locate ``phrase`` inside ``text`` at whitespace-token granularity.

    Returns the index (in tokens) of the first token of the earliest
    occurrence of ``phrase`` in ``text``, or -1 when there is none.
    """
    haystack = text.split()
    needle = phrase.split()
    width = len(needle)
    # Last start position at which a window of `width` tokens still fits.
    last_start = len(haystack) - width
    return next(
        (
            start
            for start in range(last_start + 1)
            if haystack[start:start + width] == needle
        ),
        -1,
    )

In [None]:
def form_answers(passage, answers, max_answers=10):
    """Turn a raw multi-line model reply into ranked answer records.

    Single and double quotation marks are removed from the reply, which is
    then split into lines. Each line is treated as one candidate answer and
    is located in ``passage`` by matching its first two whitespace-separated
    tokens (or its only token) against the passage tokens. The end index is
    the start index plus the candidate's token count minus one.

    Lines that are blank, or whose leading tokens do not occur in the passage
    as whole tokens, are skipped rather than raising. Processing stops at the
    first line containing "no answer" (case-insensitive) or once
    ``max_answers`` records have been collected.

    Args:
        passage: Text the answers are expected to be taken from.
        answers: Raw reply text, one candidate answer per line.
        max_answers: Maximum number of records to return (default 10).

    Returns:
        A list of dictionaries built by ``to_dict``, ranked 1, 2, ... in the
        order the candidates appear in the reply, each with score 1.
    """
    answers = answers.replace('"', '').replace("'", '')
    answers_list = []
    for answer in answers.split("\n"):
        if len(answers_list) >= max_answers:
            break
        if 'no answer' in answer.lower():
            break
        # Tokenise exactly like get_phrase_index (any whitespace) so that
        # repeated or trailing spaces never create empty tokens that would
        # shift the computed end index.
        words = answer.split()
        if not words:
            continue
        # Matching on whole tokens also covers single-word answers: a word
        # that is only a substring of a passage token yields -1 and is skipped.
        start_token_indx = get_phrase_index(passage, " ".join(words[:2]))
        if start_token_indx == -1:
            continue
        end_token_indx = start_token_indx + len(words) - 1
        answers_list.append(
            to_dict(
                answer=answer,
                rank=len(answers_list) + 1,
                score=1,
                start_token_indx=start_token_indx,
                end_token_indx=end_token_indx,
            )
        )
    return answers_list

In [None]:
# Read every record of the configured dataset file into memory.
dataset_jsonl  = load_jsonl(dataset_file)

Loaded 431 records from QQA23_TaskB_qrcd_v1.2_test_preprocessed.jsonl


In [None]:
# Keyword arguments shared by every chat-completion request.
COMPLETIONS_API_PARAMS = dict(
    # A temperature of 0.0 is used because it gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=1000,
    model=COMPLETIONS_MODEL,
)

In [None]:
def answer_question(
    question: str,
    passage: str,
    show_prompt: bool = False,
) -> str:
    """Ask the chat model to answer ``question`` from ``passage``.

    The user message is a fixed Arabic instruction block followed by the
    question and the passage; a fixed Arabic system message is sent with it.

    Args:
        question: The question text.
        passage: The passage text appended after the question.
        show_prompt: When True, print the full user prompt before sending it.

    Returns:
        The message content of the first choice in the API response.
    """
    instructions = """أجب على السؤال التالي من النص المرفق فقط . لا تقم بإضافة أية شرح أو أية إجابة من خارج النص. اكتب الإجابة أو الإجابات فقط, إن وجدت أكثر من إجابة اكتبها على شكل تعدادات. الاجابة يجب أن تكون فقط المقطع أو المقاطع التي تحوي الجواب بدون أية زيادة. اجعل كل مقطع في سطر منفصل. إن لم توجد إجابة، اكتب: "No Answer"
."\n\n"""
    prompt = instructions + question + "\n" + passage

    if show_prompt:
        print(prompt)

    chat_messages = [
        {"role": "system", "content" : "أنت عالم في اللغة العربية وعلوم القرآن"},
        {"role": "user", "content" : prompt},
    ]
    response = openai.ChatCompletion.create(
        messages=chat_messages,
        **COMPLETIONS_API_PARAMS
    )

    first_choice = response["choices"][0]
    return first_choice["message"]["content"]

In [None]:
# Query the model once per record, keeping both the raw replies (`answers`)
# and the answer records parsed from them (`all_questions`, keyed by pq_id).
REQUESTS_PER_PAUSE = 90  # pause once after this many requests
PAUSE_SECONDS = 30       # length of each pause and of the retry back-off

all_questions = {}
answers = []
for i, pq_dict in enumerate(dataset_jsonl, start=1):
    pq_id = pq_dict['pq_id']
    passage = pq_dict['passage']
    question = pq_dict['question']
    if i % REQUESTS_PER_PAUSE == 0:
        # Wait half a minute every 90 iterations to avoid exceeding the
        # tokens-per-minute limit.
        time.sleep(PAUSE_SECONDS)
    try:
        answer = answer_question(question, passage, False)
    except Exception as first_error:
        # A failure can happen when the tokens-per-minute limit is exceeded,
        # so wait half a minute and try once more. Catching Exception (not a
        # bare except) keeps Ctrl-C / kernel interrupts working.
        print("retrying ", pq_id, " after: ", repr(first_error))
        time.sleep(PAUSE_SECONDS)
        try:
            answer = answer_question(question, passage, False)
        except Exception as second_error:
            print("error in ", pq_id, repr(second_error))
            # Sentinel stored in place of a reply when both attempts fail.
            answer = 'error'
    answers.append(answer)
    all_questions[pq_id] = form_answers(passage, answer)

In [None]:
def clean_string(input_string):
    """Drop every character that is neither a word character nor whitespace.

    Leading and trailing whitespace is removed from the result.
    """
    without_symbols = re.sub(r'[^\w\s]', '', input_string)
    return without_symbols.strip()

In [None]:

# Second pass over the raw replies: strip symbols from each reply with
# clean_string, then rebuild its answer records.
all_questions_proc = {}
errors = []
for i, pq_dict in enumerate(dataset_jsonl):
    pq_id, passage, question = (
        pq_dict[key] for key in ('pq_id', 'passage', 'question')
    )

    # Raw replies were stored in dataset order, so position i matches this record.
    answer = answers[i]

    answers_list = form_answers(passage, clean_string(answer))
    if answers_list != -2:
        all_questions_proc[pq_id] = answers_list
        continue
    # Positions whose result equals the -2 sentinel are collected separately.
    errors.append(i)

In [None]:
# Write the answer records parsed from the raw replies to a JSON file.
save_path = "AlJawaab_gpt4.json"
with open(save_path, mode="w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(all_questions, ensure_ascii=False))
print("Json file was saved into this path: ",save_path )

Json file was saved into this path:  abdul_tpgp4.json


In [None]:
# Write the answer records rebuilt from the cleaned replies to a JSON file.
save_path = "AlJawaab_pgpt4.json"
with open(save_path, mode="w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(all_questions_proc, ensure_ascii=False))
print("Json file was saved into this path: ",save_path )

In [None]:
# Write the raw, unparsed model replies to a JSON file.
save_path = "answers_gpt4.json"
with open(save_path, mode="w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(answers, ensure_ascii=False))
print("Json file was saved into this path: ",save_path )


Json file was saved into this path:  answers_ptest.json


In [None]:
# evaluating dev or training dataset
# Replace FILE_NAME.jsonl with the run file to score and GOLDEN_ANSWER_FILE.jsonl
# with the matching gold-answer file before running this cell.
# NOTE(review): the cells above save their runs with a .json extension -
# confirm which extension the evaluation script expects.
! python QQA23_TaskB_eval.py \
    --run_file "FILE_NAME.jsonl" \
    --gold_answers_file "GOLDEN_ANSWER_FILE.jsonl"

Loaded 163 records from QQA23_TaskB_qrcd_v1.2_dev_preprocessed.jsonl
pAP@10 = 0.470 


In [None]:
# run submission checker for test dataset
# Replace TEST_FILE_NAME.json with the run file to check before running this cell.
! python QQA23_TaskB_submission_checker.py \
    --run_file "TEST_FILE_NAME.json"
# expected output:
# Loaded 163 records from ../QQA23_TaskB_qrcd_v1.2_dev_preprocessed.jsonl
# pAP@10 = 25.484
# NOTE(review): the sample output above names the dev file (163 records), while
# the test file loaded earlier in this notebook reports 431 records - confirm
# what the checker is expected to print for a test run.
