In [None]:
# Packages needed for training and evaluation (model, LoRA, datasets, metrics)
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install fuzzywuzzy python-Levenshtein
!pip install bitsandbytes==0.41.3
!pip install -q -U sentencepiece
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install nltk==3.5.0
!pip install bert-score
!pip install rouge
# Packages needed only at inference time (keyword extraction, distractors)
!pip install PyPDF2
!pip install keybert
!pip install keybert[use]
!pip install keybert[spacy]
!pip install keybert[flair]
!pip install keybert[gensim]
!pip install sense2vec==2.0.1
!python -m spacy download en_core_web_sm
!pip install git+https://github.com/boudinfl/pke.git
# Download the pretrained sense2vec vectors archive and unpack it in the working directory
!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
!tar -xvf  s2v_reddit_2015_md.tar.gz

In [None]:
import os
import torch

# Set the flag that asks the W&B integration not to report this run.
os.environ["WANDB_DISABLED"] = "true"
# When CUDA is available, expose only device 0 to this process.
if torch.cuda.is_available():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# #########################################

# For Inference Time

### Cleaning the data

In [None]:
import re
import string
import nltk
# Fetch the NLTK resources used in this notebook (tokenizer models, stopword lists, WordNet data).
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# English stopword list; intended for remove_stopwords (that step is currently disabled in normalize_text).
stop_words = stopwords.words('english')
# arabic_stopwords = stopwords.words('arabic')

In [None]:
def remove_non_ascii(text):
    """Return `text` with every non-ASCII character dropped."""
    # Characters that cannot be encoded as ASCII are silently discarded.
    ascii_bytes = text.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def remove_brackets_num(text):
    """Strip every asterisk character from `text`.

    NOTE(review): the name suggests that bracketed numbers such as "[12]"
    were meant to be removed, but the pattern has only ever matched
    asterisks. That behavior is kept unchanged here; confirm the intent.
    """
    # Raw string: "\*" inside a plain literal is an invalid escape sequence,
    # which recent Python versions warn about.
    return re.sub(r"\*", "", text)

def to_lowercase(text):
    """Return a lower-cased copy of `text`."""
    lowered = text.lower()
    return lowered

def replace_numbers(text):
    """Delete every run of digits from `text`; nothing is put in its place."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)

def remove_whitespace(text):
    """Trim leading and trailing whitespace; inner whitespace is left untouched."""
    trimmed = text.strip()
    return trimmed

def remove_punctuation(text):
    """Remove a fixed set of punctuation characters from `text`.

    Characters that are not in the set (for example ``.``, ``,``, ``-``,
    ``@``, ``#``, ``+`` and ``|``) are kept.
    """
    # Raw literal: the backslash is itself one of the characters to strip, and
    # in a plain literal "\<" is an invalid escape sequence that recent Python
    # versions warn about.
    punctuation = r'''!()[]{};:'"\<>/?$%^&*_`~='''
    # A single translate() pass replaces one replace() pass per character.
    return text.translate(str.maketrans('', '', punctuation))

def remove_emails(text):
    """Delete e-mail-like spans: alphanumerics, ``@``, letters, an optional dot, alphanumerics."""
    email_like = re.compile(r'[A-Za-z0-9]*@[A-Za-z]*\.?[A-Za-z0-9]*')
    return email_like.sub("", text)

def text2words(text):
    """Split `text` into tokens with NLTK's `word_tokenize`."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(words,stop_words):
    """Return the items of `words`, in order, that do not occur in `stop_words`."""
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept


def normalize_text(text):
    """Clean raw text and return its tokens re-joined with single spaces.

    Steps, in order: remove_brackets_num, to_lowercase, remove_whitespace,
    remove_punctuation, remove_emails, then tokenization with text2words.
    """
    # Disabled steps, kept for reference: remove_non_ascii (would run first),
    # replace_numbers (after lower-casing) and remove_stopwords (after tokenizing).
    cleaning_steps = (
        remove_brackets_num,
        to_lowercase,
        remove_whitespace,
        remove_punctuation,
        remove_emails,
    )
    for step in cleaning_steps:
        text = step(text)

    return ' '.join(text2words(text))

In [None]:
# NOTE(review): `text` is not assigned anywhere in the notebook as shown; it must
# hold the raw passage to process before this cell runs — confirm where it comes from.
N_text=normalize_text(text)
N_text

### Generate Keywords

In [None]:
from keybert import KeyBERT
# Shared KeyBERT instance (default constructor arguments) used by extract_keywords_from_text1.
kw_tool = KeyBERT()

In [None]:
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# def jaccard_similarity(word1, word2):
#     set1 = set(word1)
#     set2 = set(word2)

#     intersection = len(set1.intersection(set2))
#     union = len(set1.union(set2))

#     similarity = intersection / union if union > 0 else 0
#     return similarity

def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity between the stemmed bag-of-words of two strings.

    Each string is split on whitespace, every word is Porter-stemmed, and the
    two stemmed strings are turned into count vectors over a shared vocabulary.

    Returns:
        The cosine similarity of the two count vectors, or 0.0 when no
        vocabulary can be built from the two strings (for example both are
        empty or contain only tokens the vectorizer ignores).
    """
    stemmer = PorterStemmer()

    # Stem word by word so that different inflections of a word can match.
    stemmed_pair = [
        ' '.join(stemmer.stem(word) for word in sentence.split())
        for sentence in (sentence1, sentence2)
    ]

    try:
        # fit_transform is fit() followed by transform() on the same two documents.
        count_vectors = CountVectorizer().fit_transform(stemmed_pair)
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when neither
        # string yields a token; such a pair has nothing in common.
        return 0.0

    # Entry [0][1] of the 2x2 similarity matrix is sentence1 vs. sentence2.
    return cosine_similarity(count_vectors)[0][1]


def extract_keywords_from_text1(text):
    """Extract KeyBERT keywords from `text` and drop near-duplicates.

    The top 10 unigrams and the top 10 bigrams are requested from the shared
    `kw_tool`. A candidate is kept only when its similarity to every keyword
    kept so far is below 0.4.
    """
    max_pairwise_similarity = 0.4  # candidates at or above this count as duplicates

    # Unigram candidates first, then bigram candidates; empty strings are skipped.
    candidates = []
    for ngram_range in ((1, 1), (2, 2)):
        scored = kw_tool.extract_keywords(text, keyphrase_ngram_range=ngram_range, top_n=10)
        candidates.extend(entry[0] for entry in scored if entry[0])

    unique_keywords = []
    for candidate in candidates:
        is_duplicate = False
        for kept in unique_keywords:
            if not calculate_similarity(candidate, kept) < max_pairwise_similarity:
                is_duplicate = True
                break
        if not is_duplicate:
            unique_keywords.append(candidate)

    return unique_keywords

In [None]:
# KeyBERT-based keywords for the normalized passage (displayed as the cell output).
unique_keywords1=extract_keywords_from_text1(N_text)
unique_keywords1

In [None]:
import pke

def extract_keywords_from_text2(text):
    """Extract keyphrases from `text` with pke's TopicRank.

    Up to 20 best-scored phrases are requested. Every one-word phrase is kept.
    A two-word phrase is kept only if its similarity to the space-joined string
    of all returned phrases is below 0.4. Phrases of three or more words are
    discarded.
    """
    # pke pipeline: load the document, select candidates, weight them.
    topic_rank = pke.unsupervised.TopicRank()
    topic_rank.load_document(input=text, language='en')
    topic_rank.candidate_selection()
    topic_rank.candidate_weighting()

    ranked_phrases = [phrase for phrase, _score in topic_rank.get_n_best(n=20)]
    all_phrases_joined = ' '.join(ranked_phrases)

    single_words = [phrase for phrase in ranked_phrases if len(phrase.split()) == 1]
    word_pairs = [
        phrase
        for phrase in ranked_phrases
        if len(phrase.split()) == 2
        and calculate_similarity(phrase, all_phrases_joined) < 0.4
    ]

    # One-word phrases come first, followed by the surviving two-word phrases.
    return single_words + word_pairs

In [None]:
# TopicRank-based keywords for the normalized passage (displayed as the cell output).
unique_keywords2=extract_keywords_from_text2(N_text)
unique_keywords2

### Generate Distractors

In [None]:
from sense2vec import Sense2Vec
# Load the sense2vec vectors from the local 's2v_old' directory
# (presumably created by unpacking the archive downloaded in the setup cell — verify).
s2v = Sense2Vec().from_disk('s2v_old')

In [None]:
from collections import OrderedDict

def sense2vec_get_words(word, s2v):
    """Suggest up to three distractors for `word` from its sense2vec neighbours.

    The 20 nearest neighbours of the best sense of the lower-cased word are
    scanned in order. A neighbour is rejected when its similarity to the
    keyword, or to any distractor already accepted, is 0.3 or higher.
    Accepted distractors are title-cased.

    Returns:
        A list of at most three distractors; empty when the word has no sense.
    """
    similarity_threshold = 0.3
    word = word.lower()

    best_sense = s2v.get_best_sense(word)
    if best_sense is None:
        return []

    accepted = []
    for neighbour in s2v.most_similar(best_sense, n=20):
        # Keep the part of the key before "|" and turn underscores into spaces.
        candidate = neighbour[0].split("|")[0].replace("_", " ").lower()

        # Too close to the keyword itself.
        if calculate_similarity(word, candidate) >= similarity_threshold:
            continue

        # Too close to a distractor that was already accepted.
        scores = [calculate_similarity(candidate, kept) for kept in accepted]
        if any(score >= similarity_threshold for score in scores):
            continue

        accepted.append(candidate.title())

    # De-duplicate while keeping first-seen order, then cap at three.
    return list(OrderedDict.fromkeys(accepted))[:3]

In [None]:
# `unique_keywords` only exists as a local name inside the extractor functions,
# so iterate over the two notebook-level keyword lists instead (KeyBERT results
# first, then TopicRank), without repeating a keyword.
for word in dict.fromkeys(unique_keywords1 + unique_keywords2):
    existing_distractor=sense2vec_get_words(word, s2v)
    print(word , existing_distractor)

### At Inference Time

In [None]:
# General question-generation model: load the published LoRA adapter for inference
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
HUGGING_FACE_USER_NAME='mou3az'
model_name='QuestionGeneration'
# Hub repository id of the adapter, in "<user>/<model>" form
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
# The adapter config names the base model, which is loaded first together with its tokenizer
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
G_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Wrap the base model with the LoRA adapter weights
G_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
def generate_questions(context, answer, distractors=None):
    """Generate one question about `context` whose answer is `answer`.

    Args:
        context: passage the question should be about.
        answer: answer span or keyword the question should target.
        distractors: accepted for call compatibility but not used when
            building the prompt; optional so that two-argument calls work.

    Returns:
        The decoded question with any "question:" prefix removed and
        surrounding whitespace stripped.
    """
    device = next(G_model.parameters()).device
    input_text = f"Given the context '{context}' and the answer '{answer}' , what question can be asked?"
    encoding = G_tokenizer.encode_plus(input_text, padding=True,truncation=True, return_tensors="pt").to(device)

    # Beam search, returning only the best sequence.
    output_tokens = G_model.generate(**encoding, early_stopping=True, num_beams=5, num_return_sequences=1, no_repeat_ngram_size=2, max_length=200)
    question = G_tokenizer.decode(output_tokens[0], skip_special_tokens=True).replace("question:", "").strip()
    return question

In [None]:
def process_and_generate_questions(example):
    """Turn a raw passage into (keyword, distractors, question) triples.

    The passage is stripped of blank lines, its lines are joined with periods,
    and it is rejected when it tokenizes to 1024 tokens or more. Otherwise it
    is normalized, keywords are extracted, and for every keyword that has at
    least one distractor a question is generated.

    Args:
        example: raw passage text, or None.

    Returns:
        A one-element list ``[(normalized_text, triples)]`` where `triples` is
        a list of ``(keyword, distractors, question)``. When the input is
        empty, too long, or yields nothing, a message is printed and the
        returned text and/or list are empty.
    """
    N_text_file, keyword_question_distractors = "", []

    # Nothing to do for a missing or whitespace-only passage.
    if example is None or not example.strip():
        print("The file is empty or large.")
        return [(N_text_file, keyword_question_distractors)]

    # Drop blank lines, then join the remaining lines with a period.
    stripped_lines = [line.strip() for line in example.split('\n') if line.strip()]
    concatenated_text = '.'.join(stripped_lines)

    # Reject passages that are too long for the model.
    tokens = G_tokenizer.tokenize(concatenated_text)
    if len(tokens) >= 1024: # or 512 for google-flan-t5
        print("The tokenized text has more than 1024 tokens.")
        return [(N_text_file, keyword_question_distractors)]

    N_text_file = normalize_text(concatenated_text)
    if not N_text_file:
        print("No text available.")
        return [(N_text_file, keyword_question_distractors)]

    # `extract_keywords_from_text` does not exist in this notebook; the
    # KeyBERT-based extractor is used (extract_keywords_from_text2 is the
    # TopicRank alternative).
    keywords = extract_keywords_from_text1(N_text_file)
    if not keywords:
        print("No keywords generated.")
        return [(N_text_file, keyword_question_distractors)]

    for word in keywords:
        current_distractors = sense2vec_get_words(word, s2v)
        # Keywords without distractors cannot become multiple-choice items.
        if current_distractors:
            # generate_questions takes the distractors as its third argument.
            question = generate_questions(N_text_file, word, current_distractors)
            keyword_question_distractors.append((word, current_distractors, question))

    return [(N_text_file, keyword_question_distractors)]

# Example usage: put the passage to generate questions from in `txt`.
txt = ""
Total_List = process_and_generate_questions(txt)

# Print the context, then each keyword with its question and distractors
for context, keyword_question_distractors in Total_List:
    if context:
        print(f"context: {context}")
        print()
        for keyword, distractors, question in keyword_question_distractors:
            print(f"Keyword: {keyword}")
            print(f"Question: {question}")
            print(f"Distractors: {distractors}")
            print()

# ###########################################

# For Training Time

### Loading the model and the tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig

# Base checkpoint to fine-tune; the trailing comment names an alternative checkpoint.
model_name_or_path = "facebook/bart-base" # google/flan-t5-base

# Optional 4-bit quantization configuration (disabled)
# quantization_config = BitsAndBytesConfig(bit_width=4, bnb_4bit_compute_type="torch.float16")

# Load the base model in float32; no quantization configuration is passed
G_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float32,
    device_map='auto',
)
G_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
# LoRA adapter settings applied to the attention query and value projections.
Lora_config = LoraConfig(
    r=18,
    lora_alpha=12,
    target_modules=["q_proj", "v_proj"], # or q and v for other models
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

L_model = get_peft_model(G_model, Lora_config)
# print_trainable_parameters() prints the summary itself and returns None, so
# it is called directly; wrapping it in print() adds a stray "None" line.
L_model.print_trainable_parameters()

In [None]:
from datasets import Dataset, load_dataset, concatenate_datasets

def _load_qa_split(path, split, n_rows, context_key, get_answers, drop_columns):
    """Load one split of a QA dataset as shuffled context/question/answer rows.

    Rows are kept only when context, question and the answer list are present
    and non-empty, and both context and question have fewer than 780
    whitespace-separated words. The kept rows are shuffled with seed 42, the
    first `n_rows` are selected, the first answer becomes the `answer` column,
    and `drop_columns` are removed.
    """

    def _is_usable(example):
        context, question = example[context_key], example["question"]
        if context is None or question is None:
            return False
        answers = get_answers(example)
        if answers is None or len(answers) == 0:
            return False
        if len(context) == 0 or len(question) == 0:
            return False
        return len(context.split()) < 780 and len(question.split()) < 780

    def _to_qg_row(example):
        return {
            "context": example[context_key],
            "question": example["question"],
            "answer": get_answers(example)[0],
        }

    dataset = load_dataset(path, split=split).filter(_is_usable)
    dataset = dataset.shuffle(seed=42).select(range(n_rows))
    return dataset.map(_to_qg_row).remove_columns(drop_columns)


def _squad_answers(example):
    """Answer texts of a SQuAD v2 row."""
    return example["answers"]["text"]


def _drop_answers(example):
    """Answer spans of a DROP row."""
    return example["answers_spans"]["spans"]


def _additional_row_ok(example):
    """Row filter for the additional dataset: bounded context/question, answer under 6 words."""
    context, question = example["context"], example["question"]
    if context is None or len(context) == 0 or len(context.split()) >= 780:
        return False
    if question is None or len(question) == 0 or len(question.split()) >= 780:
        return False
    answer = example.get("answer")
    return answer is not None and len(answer) > 0 and len(answer.split()) < 6


# General English data from SQuAD v2 (training and validation sets)
en_g_train = _load_qa_split(
    "squad_v2", "train", 20000, "context", _squad_answers, ["id", "title", "answers"]
)
en_g_validation = _load_qa_split(
    "squad_v2", "validation", 1000, "context", _squad_answers, ["id", "title", "answers"]
)

# Different English data from DROP (training and validation sets)
en_d_train = _load_qa_split(
    "drop", "train", 20000, "passage", _drop_answers,
    ["section_id", "query_id", "answers_spans", "passage"],
)
en_d_validation = _load_qa_split(
    "drop", "validation", 1000, "passage", _drop_answers,
    ["section_id", "query_id", "answers_spans", "passage"],
)

# Additional English data
en_additional = (
    load_dataset("mou3az/Question-Answering-Generation-Choices", split="train")
    .filter(_additional_row_ok)
    .remove_columns(["distractors"])
)

# First 20000 rows for training, the next 1000 for validation
en_additional_train = en_additional.select(range(20000))
en_additional_validation = en_additional.select(range(20000, 21000))

# Concatenate the three sources, then shuffle each combined split
en_train = concatenate_datasets([en_g_train, en_d_train, en_additional_train]).shuffle(seed=123)
en_validation = concatenate_datasets(
    [en_g_validation, en_d_validation, en_additional_validation]
).shuffle(seed=123)

In [None]:
#For General data
# import ast
def create_prompt1(context, answer):
    """Build the model input prompt asking for a question about `context` answered by `answer`."""
    return f"Given the context '{context}' and the answer '{answer}' , what question can be asked?"

def create_prompt2(question):
    """Build the training target text: the question prefixed with "question: "."""
    return f"question: {question}"

In [None]:
# Tokenize the English data.
# - Every existing column is removed via `dataset.column_names`: the splits only
#   hold context/question/answer at this point, and `map` raises when asked to
#   remove a column (such as "distractors") that is not in the dataset.
# - truncation=True keeps each sequence within the tokenizer's maximum length.
# - No padding here: the DataCollatorForSeq2Seq used for training pads per batch.
def _tokenize_inputs(dataset):
    """Tokenize the context/answer prompt of every row, keeping only the tokenizer outputs."""
    return dataset.map(
        lambda samples: G_tokenizer.encode_plus(create_prompt1(samples['context'], samples['answer']), truncation=True),
        remove_columns=dataset.column_names,
    )


def _tokenize_targets(dataset):
    """Tokenize the "question: ..." target of every row and return its token ids."""
    return dataset.map(
        lambda samples: G_tokenizer.encode_plus(create_prompt2(samples['question']), truncation=True),
        remove_columns=dataset.column_names,
    )["input_ids"]


en_train_data = _tokenize_inputs(en_train)
en_validation_data = _tokenize_inputs(en_validation)
en_question_Tdata = _tokenize_targets(en_train)
en_question_Vdata = _tokenize_targets(en_validation)
# Attach the tokenized questions as the labels column
en_train_data=en_train_data.add_column("labels", en_question_Tdata)
en_validation_data=en_validation_data.add_column("labels", en_question_Vdata)

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback, Seq2SeqTrainer

# Training configuration: 3000 optimizer steps, each accumulating 10 batches of
# 45 examples per device; evaluation and logging every 150 steps.
# NOTE(review): `evaluation_strategy` (and `tokenizer=` on the trainer below) may
# have been renamed in recent transformers releases — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
      gradient_accumulation_steps=10,
      per_device_train_batch_size=45,
      per_device_eval_batch_size=45,
      # save_steps=2,
      eval_steps=150,
      warmup_steps=150,
      logging_steps=150,
      weight_decay=0.05,
      # save_total_limit=5,
      learning_rate=3e-3,
      max_steps=3000,
      # num_train_epochs=2,
      # load_best_model_at_end=True,
      # gradient_checkpointing=True,
      lr_scheduler_type="linear",
      do_train=True,
      do_eval=True,
      # fp16=False,
      report_to="all",
      log_level="debug",
      logging_dir='./logs',
      output_dir='./outputs',
      label_names=["labels"],
      evaluation_strategy="steps",
      # metric_for_best_model="eval_loss",
    )

# The collator pads each batch; label padding uses -100.
trainer = Seq2SeqTrainer(
    model=L_model,
    args=training_args,
    tokenizer=G_tokenizer,
    train_dataset=en_train_data,
    eval_dataset=en_validation_data,
    # callbacks=[EarlyStoppingCallback(2, 1.0)],
    data_collator=DataCollatorForSeq2Seq(G_tokenizer,label_pad_token_id=-100),
)

# Additional configuration: turn off the generation cache and release cached GPU memory before training
L_model.config.use_cache = False
torch.cuda.empty_cache()
# L_model.config.bnb_8bit_compute_type = "torch.float16"

# Start training
trainer.train()

In [None]:
# Publish the LoRA adapter to the Hugging Face Hub
model_name = "QuestionGeneration"
HUGGING_FACE_USER_NAME = "mou3az"

# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook. When the variable is unset, None is passed
# (presumably the hub client then falls back to a stored login — verify).
L_model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", token=os.environ.get("HF_TOKEN"))

In [None]:
# Save the adapter locally in the "QuestionGeneration" directory
L_model.save_pretrained("QuestionGeneration")
# Bundle the saved directory into saved_model.zip
!zip -r saved_model.zip QuestionGeneration

In [None]:
# Reload the published question-generation adapter for evaluation
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# The repository id has the form "<user>/<model>" and must match the id the
# adapter was pushed to: "mou3az/QuestionGeneration".
HUGGING_FACE_USER_NAME='mou3az'
model_name='QuestionGeneration'
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
# The adapter config names the base model, which is loaded first together with its tokenizer
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
G_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
L_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
def generate_questions(context, answer):
    """Generate one question about `context` whose answer is `answer` (evaluation variant).

    Uses the same prompt text as the fine-tuning inputs and samples with beam
    search, so repeated calls may return different questions.

    Returns:
        The decoded question with the "question:" prefix removed and
        surrounding whitespace stripped.
    """
    device = next(L_model.parameters()).device
    # Same wording as the training prompt, including the space before the comma.
    input_text = f"Given the context '{context}' and the answer '{answer}' , what question can be asked?"
    # truncation=True keeps long contexts within the tokenizer's maximum length.
    encoding = G_tokenizer.encode_plus(input_text, padding=True, truncation=True, return_tensors="pt").to(device)

    output_tokens = L_model.generate(
        **encoding,
        early_stopping=True,
        do_sample= True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=256,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.2
    )
    decoded = G_tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    # Training targets start with "question: "; the spaced form "question :" is
    # still stripped as before.
    question = decoded.replace("question :", "").replace("question:", "").strip()
    return question

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from fuzzywuzzy import fuzz
from bert_score import score


def calculate_bleu_scores(references, predictions):
    """Corpus-level BLEU of `predictions`, each scored against one whitespace-split reference."""
    reference_sets = []
    for ref in references:
        reference_sets.append([ref.split()])
    hypotheses = [pred.split() for pred in predictions]
    return corpus_bleu(reference_sets, hypotheses)

def calculate_rouge_scores(references, predictions):
    """Averaged ROUGE scores of `predictions` (hypotheses) against `references`."""
    return Rouge().get_scores(predictions, references, avg=True)

def calculate_accuracy(references, predictions):
    """Mean fuzzy token-sort similarity between paired references and predictions.

    Each pair is scored with `fuzz.token_sort_ratio` and scaled to 0.0-1.0.
    Pairs are formed with `zip`, so extra items in the longer list are ignored.

    Returns:
        The mean score, or 0.0 when there are no pairs to compare.
    """
    accuracies = [fuzz.token_sort_ratio(ref, pred) / 100.0 for ref, pred in zip(references, predictions)]
    # Avoid dividing by zero on an empty evaluation set.
    if not accuracies:
        return 0.0
    return sum(accuracies) / len(accuracies)

def calculate_bert_score(references, predictions):
    """Mean of the third value (F1) returned by `bert_score.score` for English text."""
    _precision, _recall, f1 = score(predictions, references, lang='en', verbose=False)
    mean_f1 = f1.mean()
    return mean_f1.item()

def evaluate(dataset):
    """Generate a question for every sample and report four text-similarity metrics.

    Args:
        dataset: iterable of samples with 'context', 'answer' and 'question' keys.

    Returns:
        Tuple ``(accuracy, bleu_score, rouge_scores, bert_score)``; the same
        values are printed.
    """
    references = []
    for sample in dataset:
        references.append(sample['question'])

    predictions = []
    for sample in dataset:
        predictions.append(generate_questions(sample['context'], sample['answer']))

    bleu = calculate_bleu_scores(references, predictions)
    rouge = calculate_rouge_scores(references, predictions)
    fuzzy_accuracy = calculate_accuracy(references, predictions)
    bert_f1 = calculate_bert_score(references, predictions)

    report = (
        ("Overall Accuracy:", fuzzy_accuracy),
        ("Overall BLEU Score:", bleu),
        ("Overall ROUGE Score:", rouge),
        ("Overall BERTScore:", bert_f1),
    )
    for label, value in report:
        print(label, value)

    return fuzzy_accuracy, bleu, rouge, bert_f1

# Evaluate on `en_validation`, the combined validation split built in the dataset cell
accuracy, bleu_score, rouge_scores, bert_score = evaluate(en_validation)

# ###########################################