# This Notebook is designed to work on Google Colab

# Config

In [None]:
# Installing the required libraries and datasets
# Clone the Qur'an QA shared-task repository; later cells read the datasets and the
# submission checker from /content/quranqa.
!git clone https://gitlab.com/bigirqu/quranqa.git
# Clone the AraBERT repository.
# NOTE(review): no cell visible here imports from this clone - confirm it is still needed.
!git clone https://github.com/aub-mind/arabert.git
# Farasa provides the FarasaSegmenter imported below; Simple Transformers provides the QA model.
!pip install farasapy
!pip install simpletransformers

In [None]:
# Simple Transformers needs to be installed again to avoid issues with restarting the kernel.
# Also, we need to pin setuptools to this version to be compatible with PyTorch and Simple Transformers.
!pip install simpletransformers
!pip install setuptools==57.4.0

In [None]:
# Import the required libraries and packages
# Instantiate the Farasa segmenter and assign the model name and type for Simple Transformers

from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs
import torch
import json, argparse
from collections import Counter
import re
import string
import logging
import os
import numpy as np
import pandas as pd
from farasa.segmenter import FarasaSegmenter
# Farasa segmenter instance, created once when this cell runs.
# NOTE(review): it is not referenced by any later cell visible here - confirm it is still needed.
farasa_segmenter = FarasaSegmenter()
# Checkpoint name and architecture key passed to QuestionAnsweringModel further down.
model_name = 'aubmindlab/bert-large-arabertv02'
model_type = 'bert'


In [None]:
# Here are the functions from the Qur'an QA scripts
# Please note that we changed the PassageQuestion class to be compatible with Simple Transformers format
# Please note that we changed the function of dump_jsonl to accommodate the answers to be in dictionaries and save it in json format that is compatible with the shared-task requirement.

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: Path of a UTF-8 encoded file holding one JSON document per line.

    Returns:
        list: The decoded objects, in file order. Blank lines are ignored.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line (e.g. an extra newline at the end of the file) holds no
            # record; passing it to json.loads would raise on the empty string.
            record = line.strip()
            if not record:
                continue
            data.append(json.loads(record))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

def dump_jsonl(data, output_path, append=False):
    """
    Write a mapping to a text file, one `"key": value,` fragment per line.

    Every entry is serialised as a single-key JSON object whose opening brace
    is dropped and whose closing brace is replaced by a comma. The file is
    therefore the body of one JSON object without the enclosing braces, and
    with a comma after the last entry as well.

    Args:
        data: Mapping to serialise; it is iterated by key and indexed with each key.
        output_path: Destination file path.
        append: When true the file is opened in 'a+' mode, otherwise it is
            overwritten ('w').
    """
    open_mode = 'w' if not append else 'a+'
    with open(output_path, open_mode, encoding='utf-8') as out_file:
        for key in data:
            serialized = json.dumps({key: data[key]}, ensure_ascii=False)
            # Drop the leading "{" and turn the final "}" into ",".
            out_file.write(serialized[1:-1] + ',' + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))



class PassageQuestion():
    """
    One passage/question record, convertible to the Simple Transformers QA format.

    The attributes pq_id, passage, surah, verses and question are copied from
    the input dictionary; answers is a list of Answer objects.
    """

    def __init__(self,dictionary) -> None:
        """
        Build the record from a decoded JSON object.

        Args:
            dictionary: Mapping with the keys "pq_id", "passage", "surah",
                "verses" and "question". "answers" is optional: when it is
                missing or None (e.g. an unlabeled split) the record simply
                has no answers.

        Raises:
            KeyError: If one of the required keys is missing.
        """
        self.pq_id = dictionary["pq_id"]
        self.passage = dictionary["passage"]
        self.surah = dictionary["surah"]
        self.verses = dictionary["verses"]
        self.question = dictionary["question"]
        # Records without gold answers must still load so they can be predicted on.
        self.answers = [Answer(answer) for answer in dictionary.get("answers") or []]

    def to_dict(self) -> dict:
        """
        Return the record as a {"context": ..., "qas": [...]} dictionary.

        The single "qas" entry carries the id, surah, verses, question and the
        answers converted with Answer.to_dict().
        """
        return {
            "context": self.passage,
            "qas": [{
                "id": self.pq_id,
                "surah": self.surah,
                "verses": self.verses,
                "question": self.question,
                "answers": [x.to_dict() for x in self.answers],
            }],
        }

class Answer():
    """A single answer: its text and its "start_char" value."""

    def __init__(self,dictionary) -> None:
        """Copy "text" and "start_char" from the decoded answer record."""
        self.text, self.start_char = dictionary["text"], dictionary["start_char"]

    def to_dict(self) -> dict:
        """Return the answer with start_char exposed under the key "answer_start"."""
        return dict(text=self.text, answer_start=self.start_char)


def read_JSONL_file(file_path) -> list:
    """
    Load a JSON lines file and wrap every record in a PassageQuestion.

    Args:
        file_path: Path handed to load_jsonl.

    Returns:
        list: One PassageQuestion per record, in file order.
    """
    passage_question_objects = [
        PassageQuestion(record) for record in load_jsonl(file_path)
    ]
    print(f"Collected {len(passage_question_objects)} Object from {file_path}")
    return passage_question_objects



In [None]:
# Checking the Google Colab using cuda instead of cpu.

if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Opening the files

In [None]:
# Load the three dataset files (train, dev, and the test split without answers)
# in that order and bind each to its own variable.

train, dev, test = (
    load_jsonl(split_path)
    for split_path in (
        "/content/quranqa/datasets/qrcd_v1.1_train.jsonl",
        "/content/quranqa/datasets/qrcd_v1.1_dev.jsonl",
        "/content/quranqa/datasets/qrcd_v1.1_test_noAnswers.jsonl",
    )
)

In [None]:
# Convert every training record to the dictionary layout produced by PassageQuestion.to_dict()
train_data = [PassageQuestion(record).to_dict() for record in train]

In [None]:
# Convert every dev record to the dictionary layout produced by PassageQuestion.to_dict()
dev_data = [PassageQuestion(record).to_dict() for record in dev]

In [None]:
# Convert every test record to the dictionary layout produced by PassageQuestion.to_dict()
test_data = [PassageQuestion(record).to_dict() for record in test]

# Set the Simple Transformers parameters and instantiate the model

In [None]:
# Training configuration: start from the library defaults, then override the
# options below one by one.
models_args = QuestionAnsweringArgs()
for option_name, option_value in {
    "train_batch_size": 15,
    "evaluate_during_training": True,
    "n_best_size": 5,
    "save_model_every_epoch": False,
    "save_steps": -1,
    "learning_rate": 0.0001,
    "num_train_epochs": 5,
    "manual_seed": 109,
    "output_dir": "/content/output/",
}.items():
    setattr(models_args, option_name, option_value)

In [None]:
# Instantiate the model with the arguments configured in models_args.
# Simple Transformers defaults to use_cuda=True and raises a ValueError when no
# CUDA device is available; follow the actual hardware so the notebook also
# runs on a CPU-only runtime.
model = QuestionAnsweringModel(
    model_type,
    model_name,
    args=models_args,
    use_cuda=torch.cuda.is_available(),
)

# Training the model and predicting the results

In [None]:
# Fine-tune on the training split; the dev split is supplied as evaluation data
# (evaluate_during_training is switched on in models_args).
model.train_model(train_data=train_data, eval_data = dev_data)

In [None]:
# Predict on the test split.
# NOTE(review): n_best_size=5 presumably caps the number of candidate answers per
# question - confirm against the Simple Transformers documentation.
# NOTE(review): despite its name, `texts` is read through ['probability'] in the
# next cell, while `result` supplies ['id'] and ['answer'].
result, texts = model.predict(test_data, n_best_size=5)

# Preparing the run file

In [None]:
# Combine each question's candidate answers with their probability scores.
# Candidates are ranked 1, 2, ... in the order they appear in the prediction, so a
# question with a single candidate gets rank 1.

submit_test_dict = {}

for prediction, probabilities in zip(result, texts):
  submit_test_dict[prediction['id']] = [
      {'answer': answer_text, 'rank': position, 'score': probability}
      for position, (answer_text, probability) in enumerate(
          zip(prediction['answer'], probabilities['probability']), start=1
      )
  ]
  


In [None]:
# Check that the created dictionary has one entry per test item (the cell displays True or False)
len(submit_test_dict) == len(test_data)

In [None]:
# Count and remove the empty answers, except when nothing else is left for a
# question, to avoid an error in the evaluation script.
# Each list is rebuilt instead of calling .remove() while iterating over it:
# removing during iteration shifts the remaining items, so the element that
# follows each removed one was never inspected and consecutive empty answers
# stayed in the run.
count = 0
for id_key, answers in list(submit_test_dict.items()):
  kept = [small_D for small_D in answers if small_D['answer'] not in ('', 'empty')]
  if not kept:
    # Every candidate is empty: keep the first (top-ranked) one so the question
    # still has an answer entry.
    kept = answers[:1]
  count += len(answers) - len(kept)

  # Renumber the surviving answers 1, 2, ... in their existing order.
  for c, small_D in enumerate(kept, start=1):
    small_D['rank'] = c
  submit_test_dict[id_key] = kept

# Number of removed answers (displayed as the cell output).
count

In [None]:
# Write the run file. dump_jsonl emits one '"id": [...],' line per question without
# the enclosing braces, so the file needs the manual edits described below before
# it is valid JSON.
# NOTE(review): the checker cell reads /content/LK2022_run21.json, so this relative
# path relies on the working directory being /content - confirm.
dump_jsonl(submit_test_dict, "LK2022_run21.json")

## Now, please do the following before running the file through the submission checker script:
- Open the JSON file, and
- add an opening curly bracket "{" at the beginning of the file, and
- remove the trailing comma "," at the end of the file, and
- add a closing curly bracket "}" at the end of the file.

# Run the file on the submission check script

In [None]:
# Validate the run file with the submission checker script from the cloned quranqa repository.
!python /content/quranqa/code/quranqa22_submission_checker.py --run_file "/content/LK2022_run21.json"