In [None]:
# Install the notebook's dependencies into the running kernel's environment
# (`%pip` targets the kernel's interpreter; `!pip` may hit a different one).
%pip install arabic-reshaper
%pip install pyarabic
%pip install pytrec_eval
# This notebook calls the legacy `openai.Embedding.create` interface, which
# was removed in openai 1.0, so stay on the last 0.x release line.
%pip install --upgrade "openai<1"
%pip install tiktoken

In [None]:
import json
import os
import pickle
from typing import List, Tuple

import numpy as np
import openai
import pandas as pd
import tiktoken


In [None]:

# Chat-completion model name (alternative kept below for quick switching).
COMPLETIONS_MODEL = "gpt-4"
# COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Model used by get_embedding for both passages and questions.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Take the key from the OPENAI_API_KEY environment variable when it is set so
# a real key never has to be pasted into the notebook; the placeholder is
# kept only as a fallback for backward compatibility.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'YOUR_OPENAI_API_KEY')
# Minimum similarity score a retrieved passage needs to be written to the run file.
threshold = 0.8

In [None]:
# Tokenizer settings. ENCODING names the tiktoken encoding to load.
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002
# Bullet-style separator placed between context sections.
SEPARATOR = "\n* "
# NOTE(review): presumably a token budget for the assembled context — confirm.
MAX_SECTION_LEN = 3000

# Load the tokenizer once and measure how many tokens the separator costs.
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

# Bare expression: shown as the cell's output.
f"Context separator contains {separator_len} tokens"

In [None]:
# Read the Quranic passages: each useful line is "<passage id>\t<passage text>".
# The file holds Arabic text, so decode it explicitly as UTF-8 instead of
# relying on the platform's locale-dependent default encoding.
ids = []
paragraphs = []

with open('QQA23_TaskA_QPC_v1.1.tsv', 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.rstrip('\n').split('\t')
        # Keep only well-formed two-column rows; blank or malformed lines
        # (including the trailing empty line) are skipped.
        if len(fields) == 2:
            ids.append(fields[0])
            paragraphs.append(fields[1])


In [None]:
# Passage table: one row per passage, with its id and its text.
df = pd.DataFrame({'id': ids, 'content': paragraphs})

In [None]:
def get_embedding(text: str, model: str = EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` selects the embedding model (defaults to EMBEDDING_MODEL).
    Returns the "embedding" field of the first item in the response's "data".
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Create an embedding for each row in the dataframe using `get_embedding`.

    `df` must have an "id" column and a "content" column.

    Returns a dictionary that maps each row's "id" value to the embedding of
    that row's "content". `get_embedding` is called once per row; if an id
    occurs more than once, the last row with that id wins.
    """
    # Iterate the two columns directly: avoids building a Series per row and
    # avoids attribute-style column access on the row.
    return {
        doc_id: get_embedding(content)
        for doc_id, content in zip(df["id"], df["content"])
    }

In [None]:
def load_embeddings(fname: str):
    """
    Load precomputed embeddings from a CSV file.

    The CSV must have a header row with the columns "title" and "heading",
    plus one column per embedding dimension named "0", "1", ... .

    Returns a dict mapping (title, heading) to the list of that row's
    embedding values in dimension order.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is a numbered dimension.
    dimension_names = [name for name in table.columns if name not in ("title", "heading")]
    last_dim = max(int(name) for name in dimension_names)
    ordered_columns = [str(i) for i in range(last_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        key = (row["title"], row["heading"])
        embeddings[key] = [row[column] for column in ordered_columns]
    return embeddings

In [None]:
def vector_similarity(x, y):
    """
    Return the dot product of the two vectors.

    For unit-length vectors this equals their cosine similarity (the original
    note states that OpenAI embeddings are normalized to length 1).
    """
    left = np.asarray(x)
    right = np.asarray(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Rank the pre-computed document embeddings against a query.

    The query is embedded with `get_embedding`, then scored against every
    embedding in `contexts` (a dict of document key -> embedding) with
    `vector_similarity`.

    Returns a list of (similarity, document key) tuples, highest similarity first.
    """
    query_vector = get_embedding(query)

    scored = []
    for doc_key, doc_vector in contexts.items():
        scored.append((vector_similarity(query_vector, doc_vector), doc_key))

    # Tuples sort by similarity first, then by key; reverse gives best-first.
    scored.sort(reverse=True)
    return scored

In [None]:
def retrieve(question: str, context_embeddings: dict, df: pd.DataFrame, top_k: int = 10) -> Tuple[list, list]:
    """
    Fetch the passages most similar to a question.

    Parameters:
        question: the question text to embed and match.
        context_embeddings: dict mapping passage key -> embedding.
        df: unused; kept so existing callers keep working.
        top_k: maximum number of passages to return (default 10).

    Returns:
        (keys, scores): two parallel lists holding the passage keys and their
        similarity scores, best match first. Both are shorter than `top_k`
        when fewer passages are available.
    """
    ranked = order_document_sections_by_query_similarity(question, context_embeddings)
    # Guard against a negative top_k, which would otherwise slice from the end.
    top = ranked[:max(top_k, 0)]

    chosen_sections_indexes = [section_index for _, section_index in top]
    scores = [score for score, _ in top]
    return chosen_sections_indexes, scores

In [None]:
# Compute the embeddings for all passages (dict: passage id -> embedding).
# compute_doc_embeddings calls get_embedding once per DataFrame row.
document_embeddings = compute_doc_embeddings(df)

In [None]:
# Load the question file; each useful line is "<question id>\t<question text>".
dataset_name = 'QQA23_TaskA_dev.tsv'
# Decode explicitly as UTF-8 so the text is read the same way regardless of
# the platform's default encoding.
with open(dataset_name, 'r', encoding='utf-8') as f:
    text = f.read()

# One entry per line; filtering of blank/malformed lines happens downstream.
sentences = text.split('\n')

In [None]:
# Build the run file: for every question keep the retrieved passages whose
# score reaches `threshold`, one row per passage in rank order.
RUN_TAG = 'Abdul'
RUN_COLUMNS = ['qid', 'Q0', 'docno', 'rank', 'score', 'tag']

result = []
for sentence in sentences:
    fields = sentence.split("\t")
    if len(fields) != 2:
        continue  # skip blank or malformed lines
    qid, question = fields

    retrieved, scores = retrieve(question, document_embeddings, df)

    rows_before = len(result)
    # Results arrive best-first, so stop at the first score below the threshold.
    for rank, (docno, score) in enumerate(zip(retrieved, scores), start=1):
        if score < threshold:
            break
        result.append([qid, "Q0", docno, rank, score, RUN_TAG])

    # Nothing retrieved or nothing passed the threshold: emit the single
    # placeholder row (docno -1, rank 1, score 1) for this question.
    if len(result) == rows_before:
        result.append([qid, "Q0", -1, 1, 1, RUN_TAG])

df_run = pd.DataFrame(result, columns=RUN_COLUMNS)
df_run.to_csv('AlJawaab_emb.tsv', sep="\t", index=False, header=False)

In [None]:
# Evaluate the run file written above (dev or training questions) with the
# QQA23_TaskA_eval.py script: -r is the run file, -q the gold answers file.
# NOTE(review): "GOLD_ANSWERS_FILE.gold" looks like a placeholder — presumably
# it must be replaced with the real gold file path before running.
! python QQA23_TaskA_eval.py \
    -r "AlJawaab_emb.tsv" \
    -q "GOLD_ANSWERS_FILE.gold"

In [None]:
# Check the generated run file with the QQA23_TaskA_submission_checker.py
# script; the file is passed through the --model-prediction option.
! python QQA23_TaskA_submission_checker.py \
    --model-prediction \
    "AlJawaab_emb.tsv"
