### Install Dependencies

In [None]:
!pip install -q -U datasets
!pip install -q -U accelerate
!pip install -q -U sentencepiece
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/boudinfl/pke.git
!pip install nltk fuzzywuzzy python-Levenshtein
!python -m spacy download en_core_web_sm
!pip install nltk==3.5.0
!pip install sense2vec==2.0.1
!wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz
!tar -xvf  s2v_reddit_2015_md.tar.gz
!pip install keybert
!pip install keybert[flair]
!pip install keybert[gensim]
!pip install keybert[spacy]
!pip install keybert[use]
!pip install python-pptx
!pip install python-docx
!pip install openpyxl
!pip install rarfile
!pip install PyPDF2
!pip install xlrd

In [None]:
import os
import torch

# Turn off Weights & Biases reporting for this session.
os.environ["WANDB_DISABLED"] = "true"
# When CUDA is usable, expose only the first GPU to the process.
if torch.cuda.is_available():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

### Dealing with the data

In [None]:
import PyPDF2
import requests
from bs4 import BeautifulSoup
import csv
import zipfile
import json
import xml.etree.ElementTree as ET
import docx
import pptx
import rarfile
import xlrd  # For XLS files
import openpyxl  # For XLSX files
import shutil  # For extracting ZIP files

def read_csv(file_path):
    """Load a CSV file and return its rows as a list of lists of strings."""
    # newline='' lets the csv module do its own line-ending handling.
    with open(file_path, 'r', newline='') as handle:
        return list(csv.reader(handle))

def read_text(file_path):
    """Return the whole content of a text file as a single string."""
    with open(file_path, 'r') as source:
        return source.read()

def read_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        A list with one entry per page, in page order, holding the value
        returned by ``page.extract_text()`` for that page.
    """
    # The reader works on the open handle, so all pages are extracted inside
    # the ``with`` block; the handle is then closed instead of being leaked.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        return [page.extract_text() for page in pdf_reader.pages]

def read_web_page(url, timeout=30):
    """Download a web page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        A string in which each paragraph's text is followed by a newline;
        empty when the page contains no ``<p>`` element.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    result = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(result.content, 'html.parser')
    # join() builds the result in one pass instead of repeated concatenation.
    return ''.join(p.get_text() + '\n' for p in soup.find_all('p'))

def read_docx(file_path):
    """Return the text of a Word document, one paragraph per line."""
    document = docx.Document(file_path)
    paragraph_texts = (para.text for para in document.paragraphs)
    return '\n'.join(paragraph_texts)

def read_pptx(file_path):
    """Collect the text of every text-bearing shape in a presentation.

    Slides and shapes are visited in order; each shape that exposes a ``text``
    attribute contributes its text followed by a newline.
    """
    presentation = pptx.Presentation(file_path)
    pieces = []
    for slide in presentation.slides:
        pieces.extend(
            shape.text + '\n' for shape in slide.shapes if hasattr(shape, "text")
        )
    return ''.join(pieces)

def read_xlsx(file_path):
    """Render the active worksheet of an XLSX workbook as plain text.

    Each row becomes one line: its non-empty cell values converted with
    ``str`` and separated by single spaces.
    """
    active_sheet = openpyxl.load_workbook(file_path).active
    rendered_rows = []
    for row in active_sheet.iter_rows(values_only=True):
        cells = [str(value) for value in row if value is not None]
        rendered_rows.append(' '.join(cells) + '\n')
    return ''.join(rendered_rows)


def read_json(file_path):
    """Parse a JSON file and return the decoded Python object."""
    with open(file_path, 'r') as source:
        return json.load(source)

def read_html(file_path):
    """Return the raw markup of a local HTML file (tags are not stripped)."""
    with open(file_path, 'r') as source:
        return source.read()

def read_xml(file_path):
    """Parse an XML file and return its root element serialised back to a string."""
    root_element = ET.parse(file_path).getroot()
    return ET.tostring(root_element, encoding='unicode')

def read_zip(file_path):
    """Read every file stored in a ZIP archive.

    Each member is unpacked into a temporary directory and passed to
    ``read_data`` as a path. ``read_data`` chooses a reader by calling
    ``endswith`` on its argument and the readers open files themselves, so the
    stream returned by ``ZipFile.open`` cannot be dispatched.

    Args:
        file_path: Path to the ZIP archive.

    Returns:
        A list holding the ``read_data`` result of each file member, in
        archive order. Directory entries are skipped.
    """
    import tempfile

    file_contents = []
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        # The directory is removed on exit; the readers return in-memory data.
        with tempfile.TemporaryDirectory() as extract_dir:
            for file_info in zip_ref.infolist():
                if file_info.is_dir():
                    continue
                # extract() returns the on-disk path it created for the member.
                extracted_path = zip_ref.extract(file_info, path=extract_dir)
                file_contents.append(read_data(extracted_path))
    return file_contents

def read_rar(file_path):
    """Read every file stored in a RAR archive.

    The archive is unpacked into a temporary directory and each extracted file
    is passed to ``read_data`` as a path. ``read_data`` chooses a reader by
    calling ``endswith`` on its argument, so the stream returned by
    ``RarFile.open`` cannot be dispatched.

    Args:
        file_path: Path to the RAR archive.

    Returns:
        A list holding the ``read_data`` result of each extracted file,
        ordered by sorted path inside the archive.
    """
    import os
    import tempfile

    file_contents = []
    with rarfile.RarFile(file_path, 'r') as rar_ref:
        # The directory is removed on exit; the readers return in-memory data.
        with tempfile.TemporaryDirectory() as extract_dir:
            rar_ref.extractall(path=extract_dir)
            for folder, subfolders, file_names in os.walk(extract_dir):
                subfolders.sort()  # makes the traversal order deterministic
                for file_name in sorted(file_names):
                    file_contents.append(read_data(os.path.join(folder, file_name)))
    return file_contents


def read_data(file_path):
    """Load content from a URL or a local file, choosing the reader by type.

    Args:
        file_path: An ``http://`` or ``https://`` address, or a path whose
            extension names the format (csv, txt, pdf, docx, pptx, xlsx,
            json, html, xml, zip, rar).

    Returns:
        Whatever the selected reader returns, or ``None`` (after printing a
        message) when the type is not supported.
    """
    # Extensions and the URL scheme are matched case-insensitively, so
    # "REPORT.PDF" is handled like "report.pdf".
    lowered = file_path.lower()

    # URLs are tested before extensions: an address such as ".../page.html"
    # would otherwise match a file extension and be opened as a local file.
    if lowered.startswith(('http://', 'https://')):
        return read_web_page(file_path)

    if lowered.endswith('.csv'):
        return read_csv(file_path)
    if lowered.endswith('.txt'):
        return read_text(file_path)
    if lowered.endswith('.pdf'):
        return read_pdf(file_path)
    if lowered.endswith('.docx'):
        return read_docx(file_path)
    if lowered.endswith('.pptx'):
        return read_pptx(file_path)
    if lowered.endswith('.xlsx'):
        return read_xlsx(file_path)
    if lowered.endswith('.json'):
        return read_json(file_path)
    if lowered.endswith('.html'):
        return read_html(file_path)
    if lowered.endswith('.xml'):
        return read_xml(file_path)
    if lowered.endswith('.zip'):
        return read_zip(file_path)
    if lowered.endswith('.rar'):
        return read_rar(file_path)

    # Nothing matched: report it and let the caller deal with None.
    print("Unsupported file type")
    return None

# Example usage: load a web page through read_data. The address starts with
# "http" and matches none of the file extensions, so read_web_page handles it.
file_path = 'https://huggingface.co/docs/transformers/quicktour'
example = read_data(file_path)
# Bare expression: shown as the cell output when run in a notebook.
example

### Cleaning the data

In [None]:
import re
import string
import nltk
# Fetch the NLTK resources requested by this notebook: the punkt tokenizer
# data, the stop-word lists, and the WordNet corpora.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
# English stop words; only consumed by remove_stopwords, which normalize_text
# currently leaves disabled.
stop_words = stopwords.words('english')
# arabic_stopwords = stopwords.words('arabic')

In [None]:
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character dropped."""
    # Characters that cannot be encoded as ASCII are skipped by 'ignore'.
    ascii_bytes = text.encode('ascii', 'ignore')
    return ascii_bytes.decode()

def remove_brackets_num(text):
    """Return ``text`` with every asterisk removed.

    NOTE(review): the name suggests bracketed numbers such as "[12]" were
    meant to be stripped, but the pattern has only ever matched "*"; that
    behaviour is kept. Confirm the intent before widening it.
    """
    # Raw string: "\*" in a normal literal is an invalid escape sequence.
    return re.sub(r"\*", "", text)

def to_lowercase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def replace_numbers(text):
    """Delete every run of digits from ``text``.

    Despite the name, nothing is substituted: matched digits are removed.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def remove_whitespace(text):
    """Strip leading and trailing whitespace; inner whitespace is untouched."""
    trimmed = text.strip()
    return trimmed

def remove_punctuation(text):
    """Return ``text`` without the punctuation characters listed below.

    Commas, periods, hyphens, ``@``, ``#``, ``+`` and ``|`` are not in the
    list and are therefore kept.
    """
    # Raw string: the backslash is itself one of the characters to strip, and
    # "\<" in a normal literal is an invalid escape sequence.
    punctuation = r'''!()[]{};:'"\<>/?$%^&*_`~='''
    # One translate() pass replaces a chain of per-character replace() calls.
    return text.translate(str.maketrans('', '', punctuation))

def remove_emails(text):
    """Remove e-mail addresses from ``text``.

    The local part may contain letters, digits, ``_``, ``.``, ``+`` and ``-``,
    and the domain may have several dot-separated labels, so an address such
    as ``john.doe@mail.example.com`` is removed whole rather than leaving
    ``john.`` and ``.com`` behind. As before, a bare ``@`` or an ``@name``
    token is removed too, because both sides of the ``@`` are optional.
    """
    return re.sub(r'[\w.+-]*@[\w-]*(?:\.[\w-]+)*', "", text)

def text2words(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(words, stop_words):
    """Return the items of ``words`` that are not in ``stop_words``, in order."""
    return list(filter(lambda token: token not in stop_words, words))


def normalize_text(text):
    """Clean a string and return its tokens re-joined with single spaces.

    The string passes, in order, through non-ASCII removal, asterisk removal,
    lower-casing, edge-whitespace stripping, punctuation removal and e-mail
    removal, and is then tokenised. Digit removal (``replace_numbers``) and
    stop-word filtering (``remove_stopwords``) are deliberately not applied.
    """
    cleaning_steps = (
        remove_non_ascii,
        remove_brackets_num,
        to_lowercase,
        remove_whitespace,
        remove_punctuation,
        remove_emails,
    )
    for step in cleaning_steps:
        text = step(text)
    return ' '.join(text2words(text))

# Normalise the content loaded in the example cell. normalize_text calls string
# methods on its argument, so `example` has to be a single string here.
N_text_file=normalize_text(example)

def divide_text_into_chunks(text, chunk_size):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The cut is purely positional, so a piece may end in the middle of a word.
    Text no longer than ``chunk_size`` (including the empty string) comes back
    as a single-element list.
    """
    if len(text) <= chunk_size:
        return [text]
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

# Split the normalised text into pieces of at most 1024 characters each.
chunks=divide_text_into_chunks(N_text_file,1024)

### Generate Keywords

In [None]:
from keybert import KeyBERT
# Shared KeyBERT instance, created with the library's default settings; it is
# the model extract_keywords_from_chunks queries.
kw_model = KeyBERT()

In [None]:
def jaccard_similarity(word1, word2):
    """Return the Jaccard similarity of the element sets of two iterables.

    For strings the elements are characters, so this measures how many
    distinct characters the two strings share. Returns 0 when both inputs
    are empty.
    """
    first_items, second_items = set(word1), set(word2)
    combined = first_items | second_items
    if not combined:
        return 0
    return len(first_items & second_items) / len(combined)
def extract_keywords_from_chunks(
    chunks,
    similarity_threshold_within_chunk=0.4,
    similarity_threshold_between_chunks=0.4,
):
    """Pick up to three mutually dissimilar keywords for each text chunk.

    For every chunk the top KeyBERT keyphrase of length one, two and three
    words is requested. A candidate is kept only when its
    ``jaccard_similarity`` to every keyword already kept for the same chunk,
    and to every keyword kept for earlier chunks, is below the matching
    threshold.

    Args:
        chunks: Iterable of text chunks.
        similarity_threshold_within_chunk: Upper bound (exclusive) on the
            similarity between two keywords of the same chunk.
        similarity_threshold_between_chunks: Upper bound (exclusive) on the
            similarity to keywords kept for previously processed chunks.

    Returns:
        A list of ``(chunk, keywords)`` tuples, in input order, containing
        only the chunks for which at least one keyword survived.
    """
    chunk_keywords_list = []
    # Keywords kept for earlier chunks, flattened for the cross-chunk check.
    accepted_keywords = []

    for chunk in chunks:
        # One request per n-gram length; top_n=1 yields at most one keyphrase each.
        chunk_keywords = [
            key[0]
            for ngram_size in (1, 2, 3)
            for key in kw_model.extract_keywords(
                chunk, keyphrase_ngram_range=(ngram_size, ngram_size), top_n=1
            )
        ]

        # Drop candidates that are too similar to one already kept for this chunk.
        unique_keywords_within_chunk = []
        for keyword in chunk_keywords:
            if all(
                jaccard_similarity(keyword, kept) < similarity_threshold_within_chunk
                for kept in unique_keywords_within_chunk
            ):
                unique_keywords_within_chunk.append(keyword)

        # Drop candidates that are too similar to a keyword of an earlier chunk.
        unique_keywords_between_chunks = [
            keyword
            for keyword in unique_keywords_within_chunk
            if all(
                jaccard_similarity(keyword, earlier) < similarity_threshold_between_chunks
                for earlier in accepted_keywords
            )
        ]

        if unique_keywords_between_chunks:
            chunk_keywords_list.append((chunk, unique_keywords_between_chunks))
            accepted_keywords.extend(unique_keywords_between_chunks)

    return chunk_keywords_list

# (chunk, keywords) pairs for the chunks that kept at least one keyword.
chunks_with_keywords = extract_keywords_from_chunks(chunks)

### Generate Distractors

In [None]:
from sense2vec import Sense2Vec
# Load the sense2vec vectors from the local 's2v_old' directory.
# NOTE(review): presumably the folder created by unpacking
# s2v_reddit_2015_md.tar.gz in the install cell — confirm the name matches.
s2v = Sense2Vec().from_disk('s2v_old')

In [None]:
from collections import OrderedDict

def sense2vec_get_words(word, s2v):
    """Suggest up to three distractor words for ``word`` using sense2vec.

    Args:
        word: Keyword or key phrase to find distractors for.
        s2v: Object providing ``get_best_sense`` and ``most_similar``.

    Returns:
        A list of at most three title-cased, de-duplicated candidates in the
        order ``most_similar`` returned them; empty when no sense is found.
    """
    similarity_threshold = 0.4
    keyword = word.lower()

    # The lookup key joins the words of a phrase with underscores.
    sense = s2v.get_best_sense(keyword.replace(" ", "_"))
    if sense is None:
        return []

    candidates = []
    for each_word in s2v.most_similar(sense, n=20):
        append_word = each_word[0].split("|")[0].replace("_", " ").lower()

        # Candidates are space-separated, so they are compared with the
        # space-separated keyword. Comparing with the underscore form would
        # never detect an identical phrase and would skew the similarity.
        if append_word != keyword and jaccard_similarity(keyword, append_word) < similarity_threshold:
            candidates.append(append_word.title())

    # OrderedDict.fromkeys removes duplicates while keeping first-seen order.
    return list(OrderedDict.fromkeys(candidates))[:3]

# Pair each chunk with the keywords that received a full set of distractors.
chunk_keyword_distractor_list = []
for chunk, keywords in chunks_with_keywords:
    # Candidate distractors for every keyword of this chunk.
    result = {}
    for word in keywords:
        result[word] = sense2vec_get_words(word, s2v)

    # Keep only the keywords that have at least three distractors.
    filtered_keywords = [
        word for word, distractors in result.items() if len(distractors) >= 3
    ]

    # Chunks without any usable keyword are left out.
    if not filtered_keywords:
        continue
    entry = (chunk, filtered_keywords, {word: result[word] for word in filtered_keywords})
    chunk_keyword_distractor_list.append(entry)

# Report, chunk by chunk, the retained keywords and their distractors.
for chunk_text, kept_keywords, distractor_map in chunk_keyword_distractor_list:
    print("Chunk:", chunk_text)
    print("Keywords:", kept_keywords)
    print("Distractors:")
    for keyword in distractor_map:
        print(f"  Distractors for {keyword}: {distractor_map[keyword]}")
    print()

### Loading the model and the tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Base sequence-to-sequence checkpoint that is fine-tuned for question generation.
model_name_or_path = "facebook/bart-base"
# Weights are loaded in float32; device_map='auto' leaves device placement to
# the library.
G_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float32,
    device_map='auto',
)
# Tokenizer belonging to the same checkpoint.
G_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

In [None]:
# Freeze every parameter of the transformer layers, encoder stack first and
# decoder stack second. Only `.layers` is visited; other parameters of the
# model are left untouched by this cell.
for stack in (G_model.model.encoder, G_model.model.decoder):
    for block in stack.layers:
        for weight in block.parameters():
            weight.requires_grad = False

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
# LoRA adapter settings: rank-18 updates on the attention query and value
# projections, scaling factor (lora_alpha) 8, 5% dropout on the adapter path,
# no bias parameters trained.
Lora_config = LoraConfig(
    r=18,
    lora_alpha=8,
    target_modules=["v_proj" , "q_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Wrap the base model so that the adapter weights are the ones being trained.
L_model = get_peft_model(G_model, Lora_config)
# print_trainable_parameters() prints its own summary and returns None, so
# wrapping it in print() would add a stray "None" line to the output.
L_model.print_trainable_parameters()

In [None]:
from datasets import Dataset, load_dataset, concatenate_datasets

# General-domain English data (SQuAD v2). Every split below follows the same
# recipe: drop rows without an answer, context or question; shuffle with a
# fixed seed; keep a fixed number of rows; flatten each row to
# context/question/answer using the first listed answer; drop the other columns.
en_g_train = (
    load_dataset("squad_v2", split="train")
    .filter(lambda example: len(example["answers"]["text"]) > 0 and example["context"] and example["question"])
    .shuffle(seed=42)
    .select(range(7300))
    .map(lambda example: {"context": example["context"], "question": example["question"], "answer": example["answers"]["text"][0]})
    .remove_columns(["id", "title", "answers"])
)
en_g_validation = (
    load_dataset("squad_v2", split="validation")
    .filter(lambda example: len(example["answers"]["text"]) > 0 and example["context"] and example["question"])
    .shuffle(seed=42)
    .select(range(382))
    .map(lambda example: {"context": example["context"], "question": example["question"], "answer": example["answers"]["text"][0]})
    .remove_columns(["id", "title", "answers"])
)

# A second English source (DROP). Its "passage" column is renamed to "context"
# and the first answer span becomes "answer", so the columns line up with the
# SQuAD-derived splits.
en_d_train = (
    load_dataset("drop", split="train")
    .filter(lambda example: len(example["answers_spans"]["spans"]) > 0 and example["passage"] and example["question"])
    .shuffle(seed=42)
    .select(range(7300))
    .map(lambda example: {"context": example["passage"], "question": example["question"], "answer": example["answers_spans"]["spans"][0]})
    .remove_columns(["section_id", "query_id", "answers_spans", "passage"])
)
en_d_validation = (
    load_dataset("drop", split="validation")
    .filter(lambda example: len(example["answers_spans"]["spans"]) > 0 and example["passage"] and example["question"])
    .shuffle(seed=42)
    .select(range(383))
    .map(lambda example: {"context": example["passage"], "question": example["question"], "answer": example["answers_spans"]["spans"][0]})
    .remove_columns(["section_id", "query_id", "answers_spans", "passage"])
)

# IT-domain English data. These splits are used whole (the row limit is
# commented out) and are read through their existing answer/context/question
# columns; only "id" is dropped.
en_it_train = (
    load_dataset("mou3az/IT_QA-QG", split="train")
    .filter(lambda example: len(example["answer"]) > 0 and example["context"] and example["question"])
    .shuffle(seed=42)
    # .select(range(20000))
    .remove_columns(["id"])
)
en_it_validation = (
    load_dataset("mou3az/IT_QA-QG", split="validation")
    .filter(lambda example: len(example["answer"]) > 0 and example["context"] and example["question"])
    .shuffle(seed=42)
    # .select(range(1000))
    .remove_columns(["id"])
)

# Combine the three training sets into one dataset
mixed_train = concatenate_datasets([en_g_train, en_it_train, en_d_train])

# Shuffle so that rows from the three sources are interleaved
en_train = mixed_train.shuffle(seed=123)

# Combine the three validation sets into one dataset
mixed_validation = concatenate_datasets([en_g_validation, en_it_validation, en_d_validation])

# Shuffle so that rows from the three sources are interleaved
en_validation = mixed_validation.shuffle(seed=123)

In [None]:
#For General data
def create_prompt1(context, answer):
    """Build the model input that asks for a question about ``answer`` in ``context``."""
    return f"Given the context '{context}' and the answer '{answer}', what question can be asked?"

def create_prompt2(question):
    """Build the target text: the question prefixed with ``question: ``."""
    return f"question: {question}"

In [None]:
# Tokenise the English data. The model inputs come from create_prompt1
# (context + answer); the labels are the input_ids of create_prompt2 (the
# question), tokenised in separate passes and attached as a "labels" column.
# NOTE(review): map() is called without batched=True, so each call presumably
# tokenises a single example and padding=True has nothing to pad against;
# batch padding is left to the data collator given to the trainer — confirm.
en_train_data = en_train.map(lambda samples: G_tokenizer.encode_plus(create_prompt1(samples['context'], samples['answer']), padding=True, truncation=True), remove_columns=["context", "answer", "question"])
en_validation_data = en_validation.map(lambda samples: G_tokenizer.encode_plus(create_prompt1(samples['context'], samples['answer']), padding=True, truncation=True), remove_columns=["context", "answer", "question"])
en_question_Tdata = en_train.map(lambda samples: G_tokenizer.encode_plus(create_prompt2(samples['question']), padding=True, truncation=True), remove_columns=["context", "answer", "question"])["input_ids"]
en_question_Vdata = en_validation.map(lambda samples: G_tokenizer.encode_plus(create_prompt2(samples['question']), padding=True, truncation=True), remove_columns=["context", "answer", "question"])["input_ids"]
# Inputs and labels were derived from the same datasets in the same order, so
# row i of "labels" belongs to row i of the inputs.
en_train_data=en_train_data.add_column("labels", en_question_Tdata)
en_validation_data=en_validation_data.add_column("labels", en_question_Vdata)

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback, Seq2SeqTrainer

# Training configuration: 1000 optimizer steps, each accumulating 24
# micro-batches of 8 examples per device, with a linear schedule and 50
# warm-up steps.
# NOTE(review): eval_steps=1 together with evaluation_strategy="steps" asks for
# an evaluation pass after every optimizer step — confirm this is intended.
# NOTE(review): report_to="all" is set while WANDB_DISABLED is exported earlier
# in the notebook — confirm which reporters should be active.
# NOTE(review): recent transformers releases name this argument
# `eval_strategy`; check `evaluation_strategy` against the installed version.
training_args = Seq2SeqTrainingArguments(
      gradient_accumulation_steps=24,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=8,
      # save_steps=2,
      eval_steps=1,
      warmup_steps=50,
      logging_steps=1,
      weight_decay=0.05,
      # save_total_limit=5,
      learning_rate=3e-3,
      max_steps=1000,
      # num_train_epochs=2,
      # load_best_model_at_end=True,
      # gradient_checkpointing=True,
      lr_scheduler_type="linear",
      do_train=True,
      do_eval=True,
      # fp16=False,
      report_to="all",
      log_level="debug",
      logging_dir='./logs',
      output_dir='./outputs',
      label_names=["labels"],
      evaluation_strategy="steps",
      metric_for_best_model="eval_loss",
    )

# Trainer for the LoRA-wrapped model. The collator pads each batch and fills
# label padding with -100.
trainer = Seq2SeqTrainer(
    model=L_model,
    args=training_args,
    tokenizer=G_tokenizer,
    train_dataset=en_train_data,
    eval_dataset=en_validation_data,
    # callbacks=[EarlyStoppingCallback(2, 1.0)],
    data_collator=DataCollatorForSeq2Seq(G_tokenizer,label_pad_token_id=-100),
)

# Additional configuration: switch off the generation cache for training and
# release cached GPU memory before the run starts.
L_model.config.use_cache = False
torch.cuda.empty_cache()

# Start training
trainer.train()

In [None]:
# Upload to the Hugging Face Hub. Fill in the repository name, the user name
# and an access token before running this cell.
model_name = ""
HUGGING_FACE_USER_NAME = ""

# Push the PEFT wrapper (L_model), not the wrapped base model: the loading cell
# reads the repository with PeftConfig.from_pretrained and
# PeftModel.from_pretrained, which need the adapter files.
L_model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", token='')

In [None]:
# Save model checkpoint
# NOTE(review): the target directory is an empty placeholder — fill it in.
L_model.save_pretrained("")
# Create a zip archive
# NOTE(review): no input path is given to zip; append the saved directory.
!zip -r saved_model.zip 
# Create a rar archive
# NOTE(review): no input path is given to rar; append the saved directory.
!rar a saved_model.rar 

In [None]:
#general question generation model
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Hub repository that holds the fine-tuned adapter.
HUGGING_FACE_USER_NAME='mou3az'
model_name='IT-General_Question-Generation'
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"
# The adapter config records which base checkpoint it was trained on; the base
# model and tokenizer are loaded from that name.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=False, device_map='auto')
G_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA adapter on top of the base model. This rebinds G_model (and
# G_tokenizer above), replacing the objects created in the training cells.
G_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge
from fuzzywuzzy import fuzz

def calculate_bleu_scores(references, predictions):
    """Return the corpus-level BLEU score of ``predictions`` against ``references``.

    Both are sequences of strings; each is split on whitespace and every
    prediction is scored against exactly one reference.
    """
    reference_tokens = [[reference.split()] for reference in references]
    hypothesis_tokens = [prediction.split() for prediction in predictions]
    return corpus_bleu(reference_tokens, hypothesis_tokens)

def calculate_rouge_scores(references, predictions):
    """Return the averaged ROUGE-L F-score of ``predictions`` against ``references``."""
    averaged = Rouge().get_scores(predictions, references, avg=True)
    return averaged['rouge-l']['f']

def calculate_accuracy(references, predictions):
    """Return the mean fuzzy token-sort similarity between paired strings.

    This is a similarity score scaled to [0, 1], not an exact-match accuracy.

    Args:
        references: Sequence of reference strings.
        predictions: Sequence of predicted strings, paired by position.

    Returns:
        The average of ``fuzz.token_sort_ratio(ref, pred) / 100`` over the
        pairs, or ``0.0`` when there are no pairs.
    """
    accuracies = [fuzz.token_sort_ratio(ref, pred) / 100.0 for ref, pred in zip(references, predictions)]
    # Guard against division by zero on empty input.
    if not accuracies:
        return 0.0
    return sum(accuracies) / len(accuracies)

def evaluate(dataset):
    """Generate a question for every sample and print three overall scores.

    Each sample must provide ``question``, ``context`` and ``answer``. Nothing
    is returned; the scores are printed.
    """
    references = [row['question'] for row in dataset]
    # NOTE(review): get_question is not defined in the visible part of this
    # file — confirm it exists before this function is called.
    predictions = [get_question(row['context'], row['answer']) for row in dataset]

    scores = {
        "bleu": calculate_bleu_scores(references, predictions),
        "rouge": calculate_rouge_scores(references, predictions),
        "accuracy": calculate_accuracy(references, predictions),
    }

    print("Overall Accuracy:", scores["accuracy"])
    print("Overall BLEU Score:", scores["bleu"])
    print("Overall ROUGE Score:", scores["rouge"])

# Score the model on the mixed validation split built in the dataset cell;
# its rows still carry the raw context/question/answer columns.
evaluate(en_validation)

In [None]:
def generate_questions_with_distractors(chunk_keyword_distractor_list):
    """Generate one question per keyword and pair it with its distractors.

    Args:
        chunk_keyword_distractor_list: Iterable of
            ``(chunk, keywords, distractors_dict)`` tuples, where
            ``distractors_dict`` maps a keyword to its distractors.

    Returns:
        A list of ``(chunk, keyword, question, distractors)`` tuples, one per
        keyword, in input order.
    """
    # Inputs must be on the same device as the model weights.
    device = next(G_model.parameters()).device
    all_questions = []
    for chunk, keywords, distractors_dict in chunk_keyword_distractor_list:
        for keyword in keywords:
            distractors = distractors_dict.get(keyword, [])
            # The chunk is the context and the keyword is the expected answer;
            # the wording is the same as in create_prompt1.
            input_text = f"Given the context '{chunk}' and the answer '{keyword}', what question can be asked?"
            # truncation=True matches how the training inputs were tokenised.
            encoding = G_tokenizer.encode_plus(input_text, padding=True, truncation=True, return_tensors="pt").to(device)

            output_tokens = G_model.generate(**encoding, early_stopping=True, num_beams=5, num_return_sequences=1, no_repeat_ngram_size=2, max_length=100)
            # The training targets start with "question:", so that prefix is stripped.
            question = G_tokenizer.decode(output_tokens[0], skip_special_tokens=True).replace("question:", "").strip()
            all_questions.append((chunk, keyword, question, distractors))
    return all_questions
# Generate a question for every retained keyword of every chunk.
Total_List= generate_questions_with_distractors(chunk_keyword_distractor_list)

# Print the questions, each with its source chunk, answer keyword and distractors
for chunk,keyword, question, distractors in Total_List:
    print(f"chunk: {chunk}")
    print(f"Keyword: {keyword}")
    print(f"Question: {question}")
    print(f"Distractors: {distractors}")
    print()