In [2]:
import math
import os
import re


import nltk
import numpy as np
import pandas as pd
import pyterrier as pt
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, InputExample
from sentence_transformers import evaluation
from sentence_transformers import models, losses, datasets

from torch import nn

from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []
index=load_index("C:/Users/LEGION/OneDrive - University Of Jordan/GP/quran-qa-2023-main-Task-A/quran-qa-2023-main-Task-A/Task-A/data/QPC_Index/data.properties")

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.6



Index was loaded successfully from this path:  C:/Users/LEGION/OneDrive - University Of Jordan/GP/quran-qa-2023-main-Task-A/quran-qa-2023-main-Task-A/Task-A/data/QPC_Index/data.properties


In [8]:
df_passage=pd.read_csv('../QuestionEncoding/Task A Data/passage.csv')
df_query_dev=pd.read_csv('../QuestionEncoding/Task A Data/questionsDev.csv')
df_qppair_dev=pd.read_csv('../QuestionEncoding/Task A Data/pairsDev.csv')
df_query_train=pd.read_csv('../QuestionEncoding/Task A Data/questionsTrain.csv')
df_query_test=pd.read_csv('../QuestionEncoding/Task A Data/questionsTest.csv')

In [21]:
df_query_train

Unnamed: 0,qid,text,query
0,101,من هم قوم شعيب؟,هم قوم شعيب
1,102,من هم قوم موسى؟,هم قوم موسي
2,103,من بنى الكعبة؟,بني كعبه
3,105,من هو النبي المعروف بالصبر؟,هو نبي معروف صبر
4,106,من كفل السيدة مريم؟,كفل سيد مريم
...,...,...,...
169,422,ما هي الأماكن التي ذُكرت في القرآن كأماكن مقدسة؟,ما هي مكان الذي ذكر قران مكان مقدس
170,423,لماذا لم يتم حذف الآيات المنسوخة من القرآن؟,لماذا لم تم حذف ايه منسوخ قران
171,425,هل سيدنا محمد هو أفضل الأنبياء؟,هل سيد محمد هو افضل نبي
172,426,هل حذر القرآن المؤمنين من اتخاذ أهل الكتاب أول...,هل حذر قران مءمن اتخاذ اهل كتاب ولي ل


In [None]:
df_passage

In [8]:
#Pretraining our bert sentence transformer model
def run_simcse(model_name, batch_size, num_epochs, save_name, train_samples_qp_task_B=None):

    word_embedding_model = models.Transformer(model_name, max_seq_length=128)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    train_objectives = []

    train_sentences = df_passage['passage'].tolist()

    train_data = [InputExample(texts=[s, s]) for s in train_sentences]
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    train_objectives.append((train_dataloader, train_loss))

    if train_samples_qp_task_B:
        train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, batch_size=batch_size, shuffle=True)
        constraint_loss = losses.ContrastiveLoss(model)
        train_objectives.append((train_samples_qp_task_B_dataloader, constraint_loss))

    model.fit(
        train_objectives=train_objectives,
        epochs=num_epochs)

    model_name = os.path.join(f'/kaggle/working/fine_tune/simcse-model-{save_name}')

    model.save(model_name)

    return model_name

In [None]:
#Defining our final model structure
def build_biencoder(sentence_embedder, max_seq_len):
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(sentence_embedder, max_seq_length=max_seq_len)
    print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
    print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=512,
                               activation_function=nn.Tanh())
    bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

    return bi_encoder

In [7]:
#Finetuning our model
def train_biencoder(bi_encoder, train_objectives, batch_size, num_epochs, warmup_steps):
    torch.cuda.empty_cache()

    dev_evaluator = evaluation.InformationRetrievalEvaluator(
        df_query_dev.groupby('qid')['query'].apply(str).to_dict(),
        df_passage.groupby('pid')['passage'].apply(str).to_dict(),
        df_qppair_dev.groupby('qid')['docid'].apply(set).to_dict(),
        accuracy_at_k=[10],
        precision_recall_at_k=[10],
        map_at_k=[10], mrr_at_k=[10]
    )

    # multi-task training
    print("train_batch_size", batch_size)
    bi_encoder.fit(
        train_objectives=train_objectives,
        evaluator=dev_evaluator,
        epochs=num_epochs,
        evaluation_steps=301,
        warmup_steps=warmup_steps,
        output_path="/kaggle/working/fine_tuned_encoder"
    )

    return bi_encoder

In [2]:
def encode_by_biencoder(encoder):
    passage_embeddings = encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True)
    query_train_embeddings = encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True)
    query_dev_embeddings = encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True)
    query_tset_embeddings = encoder.encode(df_query_test['query'].tolist(), convert_to_tensor=True)

    # won't be returned therefore ignored
    # df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
    # df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
    # df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()
    # df_query_test['embedding'] = query_tset_embeddings.cpu().numpy().tolist()

    return passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings

In [10]:
def gym_run2():
    model_name = 'distilbert-base-multilingual-cased'
    sentence_embedder_name = run_simcse(model_name, batch_size=32, num_epochs=40, save_name="run2")

    max_seq_len = 128
    bi_encoder = build_biencoder(sentence_embedder_name, max_seq_len)

    train_batch_size = 16
 
    triplets=pd.read_csv('/kaggle/input/dataset1/FarasaDataSet/trainTripletsFarasa.csv')
    train_samples_qp_triple =[]
    for i in range(len(triplets)):
        train_samples_qp_triple.append(InputExample(texts=[triplets['question'].iloc[i],triplets['relPassage'].iloc[i],triplets['irrelPassage'].iloc[i]]))
    train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)

    train_biencoder_loss_Triple = losses.TripletLoss(bi_encoder,distance_metric =TripletDistanceMetric.COSINE)

    num_epochs = 5
    warmup_steps = math.ceil(len(train_qp_triple_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up

    train_objectives = [
        (train_qp_triple_dataloader, train_biencoder_loss_Triple)
    ]
    bi_encoder = train_biencoder(bi_encoder, train_objectives, train_batch_size, num_epochs, warmup_steps)
    # model_save_path = os.path.join(data_path, f'model/biencoder-simsce-run2')

    passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings = encode_by_biencoder(
        bi_encoder)


    hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=10)
    return bi_encoder


In [3]:
encoder=SentenceTransformer('../QuestionEncoding/Encoder_Model_run2/',128,device='cuda')

In [6]:
BM25_model = pt.BatchRetrieve(index, controls={"wmodel": "BM25"}, num_results=10)

In [9]:
bm25_biencoder_hit = []

for query in df_query_dev['query'].tolist():
    bm25_result = BM25_model.search(query)
    bm25_related_passage = bm25_result['docno'].tolist()
    passage = df_passage[df_passage['pid'].isin(bm25_related_passage)]['passage'].tolist()

    try:
        query_embedding = encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage_embeddings = encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        # TODO
        '''
        check similarity between query and passage, with methods is better for this task?
        util.dot_score
        util.cos_sim
        util.pairwise_dot_score
        util.pairwise_cos_sim
        '''
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=10)[0]
        mapping = {index: row['docno'] for index, row in bm25_result.iterrows()}

    except:
        #len passage is 0 but why ?
        print(f"len passage : {len(passage)}, qury : {query}")
        query_embedding = encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage = df_passage['passage'].tolist()
        passage_embeddings = encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)

        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
        mapping = {index: row['pid'] for index, row in df_passage.iterrows()}

    for i in range(len(hit)):
        hit[i]['corpus_id'] = mapping[hit[i]['corpus_id']]
    hit = sorted(hit, key=lambda x: x['score'], reverse=True)
    bm25_biencoder_hit.append(hit)

In [10]:

def save_query_passage_retrieval(result, tag, run_save=False, df_query=df_query_train, top_k=10):
    if "bienc_" in tag:
        np_result = np.array(result).flatten()
        result = pd.DataFrame()

        result["qid"] = df_query["qid"].tolist() * top_k
        result = result.sort_values(by=['qid']).reset_index(drop=True)
        result["Q0"] = ["Q0"] * len(result)
        result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]
        result["rank"] = list(range(1, top_k + 1)) * len(df_query)
        result["score"] = [x['score'] for x in np_result]
        result["tag"] = [tag] * len(np_result)

    elif tag == "SimCSE_bmbiencd":
        df_result = pd.DataFrame()
        for i in range(len(bm25_biencoder_hit)):
            for j in range(len(bm25_biencoder_hit[i])):
                new_record = pd.DataFrame([{"qid": df_query_dev['qid'].tolist()[i],
                                            "Q0": "Q0",
                                            "pid": bm25_biencoder_hit[i][j]['corpus_id'],
                                            "rank": j,
                                            "score": bm25_biencoder_hit[i][j]['score'],
                                            "tag": tag
                                            }])
                df_result = pd.concat([df_result, new_record], ignore_index=True)
        result = df_result
        print(type(result))

    elif tag == "BM25":
        result["Q0"] = ["Q0"] * len(result)
        result["tag"] = [tag] * len(result)
        result['qid'] = result["qid"]
        result['pid'] = result["docno"]
        tag = "BM25_Final"
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    elif tag == "biencoder_cross":
        result['tag'] = tag
        result['Q0'] = 'Q0'
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    if run_save:
        run_save_path = os.path.join(data_path, f"runs/{tag}.tsv")
        # print(run_save_path)
        result.to_csv('results.csv', sep="\t", index=False, header=False)

    return result

In [11]:
def evaluate_biencoder(query_embeddings, passage_embeddings, df_query, tag):
    hits = util.semantic_search(query_embeddings, passage_embeddings, top_k=10)
    df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query, top_k=10)
    return df_run

In [12]:
passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings = encode_by_biencoder(encoder)

hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=10)

In [6]:
from sentence_transformers.losses import TripletDistanceMetric

In [7]:
TripletDistanceMetric.COSINE

<function sentence_transformers.losses.TripletLoss.TripletDistanceMetric.<lambda>(x, y)>

In [58]:
a=[x for x in np_result]

In [57]:
a=[x['corpus_id'] for x in a]

In [50]:
len(a)

520

In [59]:
df_passage.iloc[[x['corpus_id'] for x in a]]['pid']

612       19:1-11
659      21:83-86
661      21:89-90
898      38:41-44
829      33:41-44
          ...    
1233     92:12-21
153     3:190-195
545      16:35-40
1037     52:48-49
1116       67:1-5
Name: pid, Length: 520, dtype: object

In [52]:
result["qid"] = df_query_train["qid"].tolist()*10
result = result.sort_values(by=['qid']).reset_index(drop=True)
result

Unnamed: 0,qid
0,101
1,101
2,101
3,101
4,101
...,...
1735,427
1736,427
1737,427
1738,427


In [13]:
df_run = save_query_passage_retrieval(hits, tag="bienc_train2", df_query=df_query_train, top_k=10)
df_run

ValueError: Length of values (520) does not match length of index (1740)

In [18]:
m=re.match('ومن يطع الله والرسول فأولئك مع الذين أنعم الله عليهم','يا أيها الذين آمنوا لا تكونوا كالذين آذوا موسى فبرأه الله مما قالوا وكان عند الله وجيها. يا أيها الذين آمنوا اتقوا الله وقولوا قولا سديدا. يصلح لكم أعمالكم ويغفر لكم ذنوبكم ومن يطع الله ورسوله فقد فاز فوزا عظيما.')

In [None]:

result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]

In [19]:
if(m):
    print('Lol')