# Config

In [None]:
!pip install sentence_transformers
!pip install pyterrier
!pip install Arabic-Stopwords
!pip install pytrec_eval


In [None]:
import numpy as np
import pandas as pd
import re
import pandas as pd
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as ar_stp
import pyterrier as pt
# pyterrier is a Python API for Terrier. Link: https://github.com/terrier-org/pyterrier
# Terrier IR Platform is a modular open source software for the rapid development of large-scale information retrieval applications.
# if not pt.started():
#     pt.init(helper_version="0.0.6")

import json
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets,  CrossEncoder, util
from transformers import AutoTokenizer
import gzip
import os
import torch

if not torch.cuda.is_available():
    print("Warning: No GPU found. Please add GPU to your notebook")

from sklearn.feature_extraction import _stop_words
import string
from tqdm.autonotebook import tqdm

from torch import nn
import os
import time


# Load the data

## Training set for training the model



In [None]:
# Training data for Quran QA 2023.
# The questions have already been matched with their passages (texts); the
# 'Question' and 'Passage' columns are read when the training pairs are built.
df = pd.read_csv("Quran_QA_2023/Task-A/train_data.csv")
df.head()

In [None]:
# All data from Quran QA 2022.
# The train, dev and test splits have already been aggregated into one file; its
# 'questions' and 'answers' columns are read when the training pairs are built.
df2 = pd.read_csv("Quran_QA_2022/data/QA_2022.csv")
df2.head()

In [None]:
# One positive (question, passage) pair per row of the 2023 data; every value
# is coerced with str(), so a missing cell becomes the text 'nan'.
train_examples = [
    InputExample(texts=[str(question), str(passage)])
    for question, passage in zip(df['Question'], df['Passage'])
]

# Followed by one (question, answer) pair per row of the 2022 data.
train_examples.extend(
    InputExample(texts=[str(question), str(answer)])
    for question, answer in zip(df2['questions'], df2['answers'])
)

# For the MultipleNegativesRankingLoss, we use NoDuplicatesDataLoader to make sure no duplicated records
train_dataloader = datasets.NoDuplicatesDataLoader(train_examples, batch_size=16)


## Test set

In [None]:
# Task A test questions (note: despite its name, `df_validation` holds the test file).
# Column names are supplied through `names`, so the first row of the TSV is read as data.
df_validation = pd.read_csv("quran-qa-2023-main/Task-A/data/QQA23_TaskA_test.tsv", sep = '\t', names = ['ID_Question', 'Question'])
df_validation.head()

In [None]:
# Collect every question ID into a list so that each one can later be passed to the search function.
questions_list = df_validation['ID_Question'].tolist()

## QPC data to be encoded later by the model

In [None]:
# Passage collection to retrieve from. Column names are supplied through `names`,
# so the first row of the TSV is read as data.
df_val = pd.read_csv("quran-qa-2023-main/Task-A/data/Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv", sep='\t', names=['ID', 'Passage'])
df_val.head()


In [None]:
# Passage texts as plain strings, in the same row order as df_val
# (str() turns a missing passage into the text 'nan').
passages_val = [str(passage) for passage in df_val['Passage']]

print(len(passages_val))

# Train the model

In [None]:
# Pretrained bi-encoder checkpoint, with inputs capped at 512 tokens.
# NOTE(review): `bi_encoder` is not referenced by any other visible cell
# (retrieval uses `model`) — confirm it is still needed.
bi_encoder = SentenceTransformer('qahq/CL-AraBERTv0.1-base')
bi_encoder.max_seq_length = 512
# Number of hits requested from semantic search for each query.
top_k = 32

# We use a cross-encoder, to re-rank the bi-encoder's results list to improve the quality
# NOTE(review): no re-ranking step calls `cross_encoder` in the visible cells — confirm.
cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')

In [None]:
# Set up the model: transformer word embeddings followed by a pooling layer.
word_emb = models.Transformer('qahq/CL-AraBERTv0.1-base', max_seq_length=512)
# The pooling layer is sized to the transformer's word-embedding dimension; no
# pooling mode is passed, so the library default applies.
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_emb, pooling])

# Training loss, bound to the model being fine-tuned.
train_loss = losses.MultipleNegativesRankingLoss(model)

In [None]:
# Fine-tune for a fixed number of epochs.
num_epochs = 5
# Warm-up covers 10% of all training steps (batches per epoch * epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          show_progress_bar=True)

In [None]:
# Persist the fine-tuned model under the local path 'CL_AraBERT'.
model.save('CL_AraBERT')

# Retrieve Relevant Passages

In [None]:
# Reload the fine-tuned model from the 'CL_AraBERT' path written by model.save();
# this rebinds `model`, which the encoding and search cells use.
model = SentenceTransformer('CL_AraBERT')

In [None]:
# Encode every QPC passage once; search() compares each question against this tensor.
# NOTE(review): passages are encoded as-is, whereas search() runs each query through
# arabic_preprocessing() before encoding it — confirm this asymmetry is intended.
corpus_embeddings = model.encode(passages_val, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Light Arabic preprocessing helpers
# Snowball stemmer for Arabic.
# NOTE(review): `ar_stemmer` is not used by any visible cell — confirm it is still needed.
ar_stemmer = stemmer("arabic")


# normalize the arabic text
def normalize_arabic(text):
    """Collapse common Arabic spelling variants onto a single letter.

    Alef variants become bare alef, alef maqsura becomes ya, and hamza
    written on waw or ya becomes a standalone hamza. Ta marbuta is left
    untouched.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Applied in order; each entry is (regex pattern, replacement).
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def deNoise(text):
    """Strip Arabic short-vowel marks and the tatweel elongation character.

    Removes tanwin fath/damm/kasr, fatha, damma, kasra, shadda and sukun
    (U+064B through U+0652) plus tatweel/kashida (U+0640). Every other
    character is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        The string with those nine characters removed.
    """
    # The code points are spelled out instead of embedding literal combining
    # marks: the marks are invisible in most editors and easy to corrupt when
    # the cell is copied. The character class matches exactly the same nine
    # characters as the previous verbose alternation.
    return re.sub("[\u0640\u064B-\u0652]", "", text)

def arabic_preprocessing(text):
  """Apply the light Arabic clean-up: normalize_arabic() first, then deNoise()."""
  return deNoise(normalize_arabic(text))


In [None]:
# Example: print the first test question, then display its preprocessed form
x = df_validation['Question'][0]
print(x)
arabic_preprocessing(x)

من ماذا نجى الله سبحانه وتعالى سيدنا إبراهيم؟


'من ماذا نجي الله سبحانه وتعالي سيدنا ابراهيم؟'

In [None]:
# Function to retrieve the semantically relevant answers using bi-encoders

def search(num, n_results=20):
    """Retrieve the passages most similar to one test question with the bi-encoder.

    Reads the notebook-level objects `df_validation`, `model`,
    `corpus_embeddings`, `passages_val`, `df_val` and `top_k`.

    Args:
        num: Value of `ID_Question` in `df_validation` identifying the question.
        n_results: Maximum number of ranked hits to keep (default 20). At most
            `top_k` hits are available, since that many are requested.

    Returns:
        A DataFrame with one row per kept hit and the columns `Question`,
        `answers_bi`, `rank_bi` (1-based), `score_bi`, `passage_bi`, `Q0`
        and `q_id`.

    Raises:
        KeyError: If `num` does not occur in `df_validation['ID_Question']`.
    """
    ##### Get the question from its ID
    matches = df_validation.loc[df_validation['ID_Question'] == num, 'Question']
    if matches.empty:
        raise KeyError(f"Question id {num!r} not found in df_validation")
    query = arabic_preprocessing(matches.iloc[0])
    print("Input question:", query)

    ##### Semantic search #####
    # Encode the query and move it to the device holding the corpus embeddings,
    # so the function also works on a machine without a GPU.
    question_embedding = model.encode(query, convert_to_tensor=True)
    question_embedding = question_embedding.to(corpus_embeddings.device)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    # Keep the best-scoring hits
    hits = sorted(hits, key=lambda hit: hit['score'], reverse=True)[:n_results]

    rows = []
    for rank, hit in enumerate(hits, start=1):
        corpus_id = hit['corpus_id']
        passage_text = passages_val[corpus_id]
        if passage_text == 'nan':
            # str() of a missing passage; such rows get the placeholder ID.
            passage_id = '-1'
        else:
            # passages_val is built from df_val['Passage'] in row order, so
            # corpus_id is also the row position in df_val. Looking the ID up
            # by position returns the ID of the passage that was hit, even
            # when two passages share the same text.
            passage_id = df_val['ID'].iloc[corpus_id]
        rows.append({
            'Question': query,
            'answers_bi': passage_text,
            'rank_bi': rank,
            'score_bi': hit['score'],
            'passage_bi': passage_id,
        })

    ##### Save it to a dataframe #####
    # `columns` fixes the column order and keeps the columns when there are no hits.
    df_answers = pd.DataFrame(
        rows,
        columns=['Question', 'answers_bi', 'rank_bi', 'score_bi', 'passage_bi'],
    )

    df_answers['Q0'] = 'Q0'
    df_answers['q_id'] = num

    return df_answers

In [None]:
# Try the retrieval on a single question (ID 500) before running every question.
testing_results = search(500)
testing_results

Input question: من ماذا نجي الله سبحانه وتعالي سيدنا ابراهيم؟


In [None]:
# Run retrieval for every test question and stack the per-question frames.
# Collecting the frames and concatenating once removes the dependence on a
# hard-coded first question ID (the result used to be initialised only when
# q == 500) and avoids the quadratic cost of calling concat inside the loop.
df_results_merge = pd.concat(
    [search(q) for q in tqdm(questions_list, desc="Retrieving passages")]
)

In [None]:
# Rebind instead of mutating in place so the cell can be re-run safely: a second
# in-place reset_index() would add a 'level_0' column and a third would raise.
# drop=True discards the per-question row positions left over from concat,
# which no later visible cell reads.
df_results_merge = df_results_merge.reset_index(drop=True)

In [None]:
# Columns for the run file, in output order. .copy() makes this an independent
# frame, so adding a column to it afterwards does not trigger pandas'
# SettingWithCopyWarning.
df_results_bi = df_results_merge[['q_id', 'Q0', 'passage_bi', 'rank_bi', 'score_bi']].copy()

In [None]:
# Tag every row with the run name. assign() returns a new frame, so this never
# writes into a slice of df_results_merge and gives the same result on re-run.
df_results_bi = df_results_bi.assign(run='LKAU23_run60_CL_AraBERT')
df_results_bi

In [None]:
# Write the run as a tab-separated file with neither a header row nor the index column.
df_results_bi.to_csv("LKAU23_run60_CL_AraBERT.tsv", sep='\t', index=False, header=False)


In [None]:
# Run the task's submission checker script on the run file written above.
!python quran-qa-2023-main/Task-A/code/QQA23_TaskA_submission_checker.py \
-m 'LKAU23_run60_CL_AraBERT.tsv'