In [None]:
!pip install -q auto-gptq==0.4.2  tika aspose-words accelerate
!apt-get install -y unoconv libreoffice
# !pip install "git+https://github.com/huggingface/transformers.git"
!pip install transformers==4.35.2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
unoconv is already the newest version (0.7-2ubuntu1).
libreoffice is already the newest version (1:7.3.7-0ubuntu0.22.04.4).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
# Mount Google Drive at /content/drive; the model folder loaded later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import os
import json
import requests
from tika import parser
import re
from datetime import date
from datetime import datetime
import time
from huggingface_hub import hf_hub_download
import aspose.words as aw
from io import StringIO
from bs4 import BeautifulSoup
import ast
import itertools
import random
import subprocess

In [None]:
# Load the 4-bit GPTQ-quantized LLaMa-2-13b chat model used for question generation.
# Both `tokenizer` and `model` are module-level globals that mcq() reads directly.

# Folder on the mounted Google Drive that holds the tokenizer files and quantized weights.
local_folder = "/content/drive/MyDrive/LLaMa-2-13b-chat-GPTQ-4bit"
tokenizer = AutoTokenizer.from_pretrained(local_folder, use_fast=True)
# The model is placed on the first CUDA device, so a GPU runtime is required.
# NOTE(review): quantize_config=None presumably makes auto-gptq read the quantization
# settings stored in `local_folder` — confirm against the auto-gptq version pinned above.
model = AutoGPTQForCausalLM.from_quantized(local_folder,
        model_basename="model",
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        use_triton=False,
        quantize_config=None)



In [None]:
# Placeholder stored for pages that are too short to prompt on. It contains no
# "{...}" span, so it parses to zero questions while keeping the page index aligned.
_SHORT_PAGE_PLACEHOLDER = 'Questions are not generated due to shorter length'

# Whitespace that precedes every continuation line of the [INST]/<<SYS>> prompt.
# It is part of the text sent to the model, so it is spelled out explicitly.
_TEMPLATE_INDENT = ' ' * 12

# Keys every parsed entry must carry (with a truthy value) to count as a question.
_REQUIRED_MCQ_KEYS = ("question", "options", "answer", "rank")


def _build_chat_prompt(system_message, user_prompt):
    # Assemble the [INST] <<SYS>> ... <</SYS>> ... [/INST] prompt string.
    return (
        '[INST] <<SYS>>\n'
        + _TEMPLATE_INDENT + system_message + '\n'
        + _TEMPLATE_INDENT + '<</SYS>>\n'
        + _TEMPLATE_INDENT + user_prompt + '[/INST]'
    )


def _ask_llm(system_message, user_prompt):
    # Run one generation with the global `tokenizer`/`model` and return the text after '[/INST]'.
    prompt_template = _build_chat_prompt(system_message, user_prompt)
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=0.1, top_p=0.9, max_new_tokens=768)
    decoded = tokenizer.decode(output[0])
    # The decoded sequence starts with the prompt itself; keep what follows its '[/INST]' marker.
    return decoded.split('[/INST]')[1].strip()


def _parse_entry(entry):
    # Parse one "{...}" snippet as data only (never executed); return None when it is not parseable.
    try:
        return json.loads(entry)
    except ValueError:
        pass
    try:
        # Fallback for Python-style literals such as single-quoted strings.
        return ast.literal_eval(entry)
    except (ValueError, SyntaxError, TypeError):
        return None


def _parse_llm_questions(data_string):
    # Extract the well-formed question dictionaries from one cleaned LLM reply.
    # Put double quotes around bare keys:  {question: ...}  ->  {"question": ...}
    data_string = re.sub(r'([{,])\s?([a-zA-Z_]+[a-zA-Z0-9_]*)\s?:', r'\1"\2":', data_string)

    # Normalise rank values (90, 90%, "90%") to a quoted number: "rank":"90"
    data_string = re.sub(r'"rank"\s?:\s?(\d+)%', r'"rank":"\1%"', data_string)
    data_string = re.sub(r'"rank"\s?:\s?(\d+%?)', r'"rank":"\1"', data_string)
    data_string = re.sub(r'"rank"\s?:\s?"?(\d+)%"?', r'"rank":"\1"', data_string)

    questions = []
    # Every span enclosed in {} is treated as one candidate question.
    for snippet in re.findall(r"{.*?}", data_string):
        entry = _parse_entry(snippet)
        if not isinstance(entry, dict):
            continue  # malformed snippet: skip it, keep the rest of the page
        if not all(entry.get(key) for key in _REQUIRED_MCQ_KEYS):
            continue  # a required field is missing or empty
        if not isinstance(entry["options"], list):
            continue  # options must be a list of choices to be usable downstream
        questions.append({key: entry[key] for key in _REQUIRED_MCQ_KEYS})
    return questions


def _capitalize_first(text):
    # Upper-case only the first character; unlike str.capitalize() the rest is left untouched.
    return text[:1].upper() + text[1:]


def _rank_value(question):
    # Numeric sort key for a question's rank ("90", "85%", 70 ...); 0.0 when no number is present.
    found = re.search(r'\d+(?:\.\d+)?', str(question.get('rank', '')))
    return float(found.group()) if found else 0.0


def mcq(Text, min_page_chars=100):
    ''' Returns multiple choice questions with corresponding options, answer, page_number and rank

    For every page longer than `min_page_chars` characters the model is prompted twice:
    first to generate questions as a JSON array, then to attach a quality rank to each
    of them. The second reply is parsed, cleaned and sorted by rank (best first).

    Reads these module-level names, which must exist before the call:
    `tokenizer`, `model`, `file_type`, `final` (only indexed when file_type == "mp4")
    and `modify_answer_based_on_input`.

    Parameters:
    Text(list): contain list of all required pages (one string per page)
    min_page_chars(int): pages with this many characters or fewer are skipped

    Returns:
    final mcq (list): contain list of dictionaries (each dictionary has each question, options, answer, page_number)

    Sample_output: [{'question': 'Who is the Prime minister of India?',
                     'options': ['Pakistan', 'Nepal', 'India', 'China'],
                     'answer': 'India',
                     'rank': '90',
                     'context': None,
                     'page_no': 1,
                     'statement': None,
                     'question_type': 'mcq'}] '''
    mcq_by_LLM = []
    for page_text in Text:
        if len(page_text) > min_page_chars:
            # Step 1: generate all possible mcq from a single page in json format
            prompt = "Generate as many as possible hard Multiple choice questions with four options and answer using this text:"+" "+page_text+'''\n Give me output in this JSON array format:[{"question": string, "options":List[string], "answer":string}]'''
            generated = _ask_llm(
                "You are best in generating Multiple choice questions with demanded JSON format.",
                prompt,
            )

            # Step 2: rank the generated questions by quality, using the page as reference
            prompt = generated+" \n for above text assign ranking score based on the quality of question to each question in terms of percentage varying from 0% to 100%, you can use this text from which questions are generated as a reference, text:"+" "+page_text+'''\n Give me output in this JSON array format:[{"question": string, "options":List[string], "answer":string, "rank":string}]'''
            ranked = _ask_llm(
                "You are best in ranking Multiple choice questions based on quality of question. Generate output in demanded JSON format.",
                prompt,
            )
            mcq_by_LLM.append(ranked)
        else:
            # Keep one entry per page so the index below still maps to the page number
            mcq_by_LLM.append(_SHORT_PAGE_PLACEHOLDER)

    final_generated_questions = []
    for index, raw_reply in enumerate(mcq_by_LLM):
        # Drop literal "\n" sequences first, then any remaining backslashes. Doing it the
        # other way round would strip the backslash and leave a stray "n" in the text.
        llm_output = raw_reply.replace('\\n', ' ').replace('\\', '')
        llm_output = ' '.join(llm_output.split())

        for item in _parse_llm_questions(llm_output):
            if file_type == "mp4":
                item['context'] = final[index]
            else:
                item['context'] = None
            item['page_no'] = index + 1
            item['statement'] = None
            item['question_type'] = "mcq"
            # Options and answer get the same casing rule so the answer still matches its option.
            item['options'] = [
                _capitalize_first(option) if isinstance(option, str) else option
                for option in item['options']
            ]
            if isinstance(item['answer'], str):
                item['answer'] = _capitalize_first(item['answer'])
            final_generated_questions.append(item)

    # Apply modify_answer_based_on_input function to get cleaned options and answer
    final_generated_questions = modify_answer_based_on_input(final_generated_questions)

    # Sort on the numeric rank: comparing the rank strings would put '100' below '90'.
    return sorted(final_generated_questions, key=_rank_value, reverse=True)

In [None]:
# Sample input: a list with one string per page (a single page here).
# mcq() also reads this global by name: when file_type == "mp4" it stores
# final[index] as the 'context' of every question generated for that page.
final = ["""
This story is about Lencho, a dedicated farmer and how he places trust in God to help him out of his
misery. Lencho had hoped for a good harvest, but a hail storm destroyed his crops. He was
devastated, but he firmly believed that God would help him. He knew how to write, so he wrote a
letter to God, asking him to send 100 pesos and posted the letter.
The postman noticed the letter and pulled it out of the mailbox. Upon seeing whom it was addressed
to, he started laughing loudly. He ran to the postmaster to show him the strange letter. As the
postmaster read the contents of the letter, he became very serious. He decided to help Lencho
financially by asking for donations from the post office employees. The postmaster himself decided
to put a part of his salary into helping Lencho.
However, they could only raise 70 pesos and decide to put it in an envelope and sign it off in the
name of God. The following Sunday, Lencho visited the post office and asked if there was any letter
for him. The postmaster handed him the letter. Lencho did not get surprised seeing the money but
got dismayed upon counting it. He was sure that God could not make a mistake, so he took paper
and ink, wrote another letter to God, and put it in the mailbox.
"""]

In [None]:
# Fetch the metadata record of the uploaded file and unpack the fields used by the
# rest of the notebook (mcq() reads the global `file_type`).
FILE_DATA_URL = "https://generate-questions.devbyopeneyes.com/api/getFileData/65e84280970073c3d008ab82"

# A timeout keeps the cell from hanging forever if the service does not answer, and
# raise_for_status() surfaces HTTP errors here instead of as a KeyError further down.
request = requests.get(FILE_DATA_URL, timeout=30)
request.raise_for_status()
resp = request.json()
file_data = resp["data"]

file_name = file_data["file_name"]
_id = file_data["_id"]
file_type = file_data["file_type"]
# NOTE(review): reads the same key as `file_name` — confirm whether a separate
# user-facing name field was intended.
user_file_name = file_data["file_name"]
type_of_question = file_data["type_of_question"]
if file_type == "mp4":
    # These two fields are only read for "mp4" records.
    pdf_file_path = file_data["pdf_file_path"]
    result = file_data["pdf_file_result"]
else:
    file_path = file_data["file_path"]

In [None]:
def flatten(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

In [None]:
def final_boolean_statement(true_statement,false_statement):
    ''' Build a shuffled, flat list of true/false questions.

    Parameters:
    true_statement(list): one list per page, holding the dictionaries of true-answered statements
    false_statement(list): one list per page, holding the dictionaries of false-answered statements

    Returns:
    list: for each page (pages are paired with zip, so the shorter input decides how
    many pages are used) the first half of its true statements plus the second half
    of its false statements, all pages flattened together and shuffled in random order
    '''
    per_page = [
        true_items[:len(true_items) // 2] + false_items[len(false_items) // 2:]
        for true_items, false_items in zip(true_statement, false_statement)
    ]
    combined = flatten(per_page)
    # shuffle so that required and additional questions are displayed in random order
    random.shuffle(combined)
    return combined

# Answer made of nothing but a choice label: "b", "B.", "(c)", "2)" ...
# group(1) is a letter label, group(2) a digit label (digits need a ")" or "." to count).
_ANSWER_LABEL_ONLY_RE = re.compile(r'^\s*\(?(?:([A-Da-d])[.)]?|([1-4])[.)])\s*$')

# Choice label at the very start of an answer: "A) ", "b. ", "(3) " ...
# A "." directly followed by a digit is not a label separator (keeps "1.5 million" intact).
_ANSWER_PREFIX_RE = re.compile(r'^\s*\(?(?:[A-Da-d]|[1-4])(?:\)|\.(?!\d))\s*')


def _strip_option_label(option, position):
    # Remove the label belonging to `position` (0 -> A/a/1, 1 -> B/b/2, ...) from the start of one option.
    if not isinstance(option, str):
        return option
    labels = [str(position + 1)]
    if position < 26:
        labels += [chr(65 + position), chr(97 + position)]
    pattern = r'^\s*\(?(?:' + '|'.join(labels) + r')(?:\)|\.(?!\d))\s*'
    return re.sub(pattern, '', option, count=1).strip()


def _resolve_answer(answer, options):
    # Turn a bare label into the option text, or strip a leading label from the answer text.
    if not isinstance(answer, str):
        return answer  # e.g. the bool answers of true/false questions stay untouched
    label = _ANSWER_LABEL_ONLY_RE.match(answer)
    if label:
        if label.group(1):
            position = ord(label.group(1).lower()) - ord('a')
        else:
            position = int(label.group(2)) - 1
        if isinstance(options, list) and position < len(options):
            return options[position]
        # The label points past the available options: keep it rather than guess.
        return answer.strip()
    return _ANSWER_PREFIX_RE.sub('', answer, count=1).strip()


def modify_answer_based_on_input(questions_list):
    ''' Returns questions list with modified answer and options

    For every question dictionary (modified in place):
    - a choice label at the start of an option ("A)", "a.", "1)" ... matching the
      option's position) is removed and the option is stripped of surrounding spaces;
    - an answer that is only a label ("b", "C.", "2)") is replaced by the text of the
      corresponding cleaned option;
    - any other string answer has a leading label removed.
    Labels are only removed at the start of the text, never from the middle of it.
    Non-string answers (e.g. booleans) and non-list options are left unchanged.

    A question that cannot be processed (not a dictionary, missing key) is reported
    with print() and skipped, so the remaining questions are still cleaned.

    Parameters:
    questions_list(list): list of question dictionaries with 'options' and 'answer' keys

    Returns:
    list: the same list object that was passed in
    '''
    if not questions_list:
        return questions_list

    for question in questions_list:
        try:
            options = question['options']
            if isinstance(options, list):
                # Slice assignment keeps the original list object, as callers may hold a reference.
                options[:] = [
                    _strip_option_label(option, position)
                    for position, option in enumerate(options)
                ]
            question['answer'] = _resolve_answer(question['answer'], options)
        except (KeyError, TypeError) as e:
            print(e)
    return questions_list

In [None]:
# Generate ranked MCQs for the sample pages in `final`. mcq() also reads the globals
# `tokenizer`, `model` and `file_type`, so the cells defining them must have run first.
mcq_questions=mcq(final)

In [None]:
mcq_questions

[{'question': 'How much money did the postmaster and employees raise for Lencho?',
  'options': ['100 pesos', '70 pesos', '50 pesos', '20 pesos'],
  'answer': '70 pesos',
  'rank': '90',
  'context': None,
  'page_no': 1,
  'statement': None,
  'question_type': 'mcq'},
 {'question': 'Why did Lencho write a letter to God?',
  'options': ['He wanted to ask for money',
   'He wanted to complain about the hail storm',
   'He wanted to ask for a good harvest',
   'He wanted to ask for a new tractor'],
  'answer': 'He wanted to ask for a good harvest',
  'rank': '80',
  'context': None,
  'page_no': 1,
  'statement': None,
  'question_type': 'mcq'},
 {'question': 'What did Lencho do when he received the money from the postmaster?',
  'options': ['He was surprised and grateful',
   'He was dismayed and angry',
   'He did not get surprised and expected it',
   'He refused to accept it'],
  'answer': 'He was dismayed and angry',
  'rank': '70',
  'context': None,
  'page_no': 1,
  'statement': 

In [None]:
%pip install -U datasets==2.17.0

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    peft==0.3.0 --quiet

# Installing the Reinforcement Learning library directly from github.
%pip install git+https://github.com/lvwerra/trl.git@25fa1bd

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Mount Google Drive again for the fine-tuning part of the notebook.
# NOTE(review): force_remount=True presumably re-mounts even when /content/drive is
# already mounted — confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

In [None]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

In [None]:
!pip install -q datasets

In [None]:
from datasets import load_dataset
import torch

In [None]:
# Load the training and validation CSV files from Google Drive.
# NOTE(review): split="train" is requested for the validation file too — presumably
# because the "csv" loader exposes a single unnamed data file under the "train" split; confirm.
dataset_train=load_dataset("csv", data_files="/content/drive/MyDrive/AK/Data-Hindi-csv/hindi_train_dataset_2600.csv",split="train")
dataset_val=load_dataset("csv", data_files="/content/drive/MyDrive/AK/Data-Hindi-csv/hindi_val_dataset_393.csv",split="train")

# dataset_train = pd.read_excel("/content/drive/MyDrive/AK/Hindi/hindi_ds.xlsx")
# dataset_val = pd.read_excel("/content/drive/MyDrive/AK/Hindi/hindi_ds_val.xlsx")

In [None]:
dataset_train,dataset_val

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# # model_name = "chintan4560/falcon-7b-sharded-bf16"
# # model_name = "AshishK/AK-openhathi-gptq-4bit"
# model_name = "shivarama23/OpenHathi-7B-Hi-v0.1-Base-sharded-bf16-1GB"

# Hub id of the base model to fine-tune.
model_name = "AshishK/AK-OpenHathi-7B-Hi-Sharded-bf16"

# bitsandbytes settings: load the weights in 4-bit using the "nf4" quantization type,
# with double quantization enabled and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
    )

# NOTE: this rebinds the global `model`, which earlier in the notebook held the GPTQ
# question-generation model that mcq() reads. After this cell, mcq() would use this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)
# model.config.use_cache = False

In [None]:
# Tokenizer that matches `model_name`. NOTE: this rebinds the global `tokenizer`
# that mcq() reads earlier in the notebook.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Reuse the end-of-sequence token for padding instead of adding a new [PAD] token
# (the alternative is kept commented out above).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from peft import LoraConfig

# LoRA hyper-parameters, kept as named module-level values so they are easy to tune.
lora_alpha = 16      # passed as LoraConfig(lora_alpha=...)
lora_dropout = 0.1   # passed as LoraConfig(lora_dropout=...)
lora_r = 8           # passed as LoraConfig(r=...)

# Adapter configuration for a causal language model; bias="none" is passed through to LoraConfig.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
from peft import get_peft_model

# Wrap the loaded model according to `peft_config` and rebind `model` to the wrapped object.
# NOTE(review): because `model` is overwritten, re-running this cell would wrap the
# already-wrapped model again — reload the base model first if the cell must be re-run.
model = get_peft_model(model, peft_config)
# Prints how many parameters are trainable.
model.print_trainable_parameters()