# **Importing Files**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Fetching PDFs through the arXiv API**

In [1]:
!pip install feedparser
!pip install PyPDF2
import requests
import feedparser
from PyPDF2 import PdfReader
import io

def search_arxiv(query, max_results=50):
    """Query the arXiv export API and return the parsed feed entries.

    Args:
        query: arXiv search expression, e.g. "nlp" or "all:deep learning".
            Spaces and reserved characters such as "&" or "#" are URL-encoded;
            ":" and "+" are passed through unchanged so field prefixes and
            "+"-joined queries keep working as before.
        max_results: Maximum number of entries to request.

    Returns:
        The list of entries feedparser extracted from the response body.

    Raises:
        requests.HTTPError: If the API replies with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    from urllib.parse import urlencode

    base_url = "http://export.arxiv.org/api/query"
    # Encode the parameters instead of splicing the raw query into the URL,
    # otherwise a "&" or "#" inside the query would corrupt the request.
    query_string = urlencode(
        {"search_query": query, "max_results": max_results}, safe=":+"
    )
    url = f"{base_url}?{query_string}"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error page into zero entries.
    response.raise_for_status()
    feed = feedparser.parse(response.content)
    return feed.entries

def extract_text_from_pdf(pdf_url):
    """Download a PDF and return the text of all its pages.

    Args:
        pdf_url: URL of the PDF document to download.

    Returns:
        The extracted text of every page that yielded any text, each page
        followed by a single space. Pages with no extractable text are skipped.

    Raises:
        requests.HTTPError: If the download returns an error status.
        requests.RequestException: On connection failure or timeout.
        Exception: Whatever PdfReader raises for a malformed document.
    """
    # A timeout prevents one stalled download from blocking the whole run.
    response = requests.get(pdf_url, timeout=60)
    # Surface HTTP errors here rather than feeding an error page to the PDF parser.
    response.raise_for_status()

    reader = PdfReader(io.BytesIO(response.content))
    page_texts = (page.extract_text() for page in reader.pages)
    # Join once instead of growing a string page by page.
    return "".join(text + " " for text in page_texts if text)

def _paper_pdf_url(paper):
    """Return the PDF download URL for one feed entry."""
    pdf_url = paper.get("pdf_url")
    if not pdf_url:
        # NOTE(review): the arXiv Atom feed is expected to carry a link of type
        # "application/pdf" per entry; prefer it over rewriting the abstract URL.
        for link in paper.get("links", []):
            if link.get("type") == "application/pdf" and link.get("href"):
                pdf_url = link.get("href")
                break
    if not pdf_url:
        # Fall back to rewriting only the "/abs/" path segment, not every "abs" substring.
        pdf_url = paper.link.replace("/abs/", "/pdf/", 1)
    # Keep the historical ".pdf" suffix, but never append it twice.
    if not pdf_url.endswith(".pdf"):
        pdf_url += ".pdf"
    return pdf_url


def get_combined_text_for_nlp_papers(query="nlp", max_results=50):
    """Fetch arXiv papers for a query and concatenate the text of their PDFs.

    Args:
        query: Search expression passed to search_arxiv. Defaults to "nlp".
        max_results: Maximum number of papers to request. Defaults to 50.

    Returns:
        The extracted text of every paper that could be processed, each paper
        followed by a single space. Papers whose download or parsing fails are
        reported on stdout and skipped (best-effort behaviour).
    """
    nlp_papers = search_arxiv(query, max_results=max_results)

    paper_texts = []
    for paper in nlp_papers:
        pdf_url = _paper_pdf_url(paper)
        print(f"Fetching and extracting from: {paper.title}")

        try:
            paper_texts.append(extract_text_from_pdf(pdf_url))
        except Exception as e:
            # Deliberately broad: one broken paper must not abort the whole crawl.
            print(f"Error processing {pdf_url}: {e}")

    return "".join(text + " " for text in paper_texts)

# Run the fetch-and-extract pipeline (downloads every PDF, so this is slow and network-bound)
# and report how many characters of text were collected.
combined_nlp_text = get_combined_text_for_nlp_papers()
print(f"Extracted combined text length: {len(combined_nlp_text)}")

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hDownloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25ldone
[?25h  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6049 sha256=9e5997bf522e351b41a069f71f94e18657311786aeae19f37c7fa75ffba65036
  Stored in directory: /root/.cache/pip/wheels/f0/69/93/a47e9d621be168e9e33c7ce60524393c0b92ae83cf6c6e89c5
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)


In [2]:
# Continue under a separate name so the raw extraction result stays available as combined_nlp_text.
total_text = combined_nlp_text
print(len(total_text))

3956343


In [3]:
# Drop every double quote and curly brace from the corpus in a single pass,
# then show the resulting character count.
unwanted_characters = str.maketrans('', '', '"{}')
total_text = total_text.translate(unwanted_characters)
print(len(total_text))

3954565


In [4]:
# Split the corpus into consecutive chunks of at most `chunk_size` whitespace-separated words.
chunk_size = 150
total_text_array = total_text.split()

# Stepping through the word list in strides of `chunk_size` gives every chunk
# (including the first) exactly `chunk_size` words, with the remainder in the
# last chunk. The previous index-modulo test put chunk_size + 1 words in the
# first chunk and produced no chunk at all for a one-word input.
# Each chunk keeps a trailing space, matching the format of earlier runs.
chunks = [
    " ".join(total_text_array[start:start + chunk_size]) + " "
    for start in range(0, len(total_text_array), chunk_size)
]

In [5]:
# Number of chunks produced by the splitting cell.
print(len(chunks))

3926


In [7]:
import json
# Persist the list of chunk strings as a JSON array under /kaggle/working/.
with open('/kaggle/working/chunks.json', 'w') as f:
    json.dump(chunks, f)

# **Generating the dataset using the Gemma model**

In [None]:
# General format of one dataset record. This is an illustrative literal only:
# it is not assigned to a name, so later cells do not use it.
# 'context' holds a nested list of five passages under 'sentences' and a
# matching nested list of five titles; 'cot_answer' is left as None.
{
  'id': 'seed_task_0', 
  'type': 'general', 
  'question': 'What is the official motto of the United States of America?', 
  'context': {
    'sentences': [
      ["the Gulf of Mexico are prone to hurricanes, ... and enforces the Act. [ 189 ] As of 2022, the U. S",
    "energy from fossil fuel and the largest ... there are 19, 969 airports in the U. S., of which 5, 193 are designated",
    'weaponry, ideology, and international i... and is a permanent member of the UN Security Council. The first documentary evidence of the phrase " United States',
    '[CLS] United States of America Flag Coat of arms ... dominance in nuclear and conventional',
    '##om ic soft pow er. [ 405 ] [ 406 ] Nearly all present ... rights in the United States are advanced by global standards.']
    ],
    'title': [
      ['placeholder_title',
      'placeholder_title',
      'placeholder_title',
      'placeholder_title',
      'placeholder_title']
    ]
  },
  'answer': '"In God We Trust"',
  'cot_answer': None
}

In [None]:
!pip install -q -U bitsandbytes
!!pip install -q -U accelerate
!pip install peft

In [None]:
from datetime import datetime
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
from peft import prepare_model_for_kbit_training,LoraConfig,PeftModel,get_peft_model
from datasets import load_dataset

In [None]:
# Gemma 7B instruction-tuned checkpoint, read from the attached Kaggle input directory.
model_path = "/kaggle/input/gemma/transformers/7b-it/3"
# Device that tokenized prompts are moved to before generation.
device = "cuda"

# 4-bit NF4 quantization without double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
)

# NOTE(review): add_eos_token=True presumably appends an EOS token to every prompt,
# which is unusual for generation — confirm this is intended.
# NOTE(review): model_max_length=512 is overridden by the explicit max_length passed
# at each tokenizer call in later cells — confirm which limit is intended.
tokenizer_options = dict(
    model_max_length=512,
    padding_side="left",
    add_eos_token=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Show one sample chunk (index 40) to inspect the extracted text.
print(chunks[40])

In [None]:
# Single-chunk trial run: ask the model for five questions about the first chunk.
question_instructions = (
    "Based on the following text, generate exactly 5 short and relevant questions regarding a fact in the text. "
    "The output should be a valid JSON object with 5 key-value pairs, where each key is 'question_X' (X being the number 1 to 5) "
    "and each value is the corresponding question. Return only the JSON object in the following format:\n"
    "{\n\"question_1\": \"<question 1>\",\n\"question_2\": \"<question 2>\",\n\"question_3\": \"<question 3>\",\n"
    "\"question_4\": \"<question 4>\",\n\"question_5\": \"<question 5>\"\n} Do not repeat or reference any part of the input text in your response, and only output the JSON. Stick to this format and strictly do not write or output anything else. Your response should just be a json object and no other text. If a text is in double quotes then put backslash to include it into json\n\n"
)
question_prompt = question_instructions + chunks[0] + "."

# Moving the whole encoding to the device also moves its attention mask.
question_inputs = tokenizer(
    question_prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=10000,
).to(device)

generated_ids = model.generate(
    question_inputs["input_ids"],
    attention_mask=question_inputs["attention_mask"],
    max_length=10000,
    pad_token_id=tokenizer.eos_token_id,
)
question = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# The decoded text is searched from its start, and the instructions above already
# contain one "{" (the format template), so the second "{" is taken as the start
# of the model's JSON object.
start_idx = -1
for _ in range(2):
    start_idx = question.find('{', start_idx + 1)
end_idx = question.find('}', start_idx + 1)

question_json = question[start_idx:end_idx + 1]
print(question_json)

In [None]:
# Single-chunk trial run: ask the model to answer the questions generated for the first chunk.
answer_instructions = (
    "Based on the following text, generate exactly 5 relavant answers for the following questions regarding a fact in the text. "
    "The output should be a valid JSON object with 5 key-value pairs, where each key is 'answer_X' (X being the number 1 to 5) "
    "and each value is the corresponding answer. Return only the JSON object in the following format:\n"
    "{\n\"answer_1\": \"<answer 1>\",\n\"answer_2\": \"<answer 2>\",\n\"answer_3\": \"<answer 3>\",\n"
    "\"answer_4\": \"<answer 4>\",\n\"answer_5\": \"<answer 5>\"\n} Do not repeat or reference any part of the input text in your response, and only output the JSON. Stick to this format and strictly do not write or output anything else. Your response should just be a json object and no other text\n\n"
)
answer_prompt = answer_instructions + chunks[0] + "." + "\n\n" + question_json + "."

# Moving the whole encoding to the device also moves its attention mask.
answer_inputs = tokenizer(
    answer_prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=10000,
).to(device)

generated_ids = model.generate(
    answer_inputs["input_ids"],
    attention_mask=answer_inputs["attention_mask"],
    max_length=10000,
    pad_token_id=tokenizer.eos_token_id,
)
answer = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# The decoded text is searched from its start; the prompt above contributes one "{"
# from the format template and includes question_json, so the third "{" is taken
# as the start of the model's JSON object.
start_idx = -1
for _ in range(3):
    start_idx = answer.find('{', start_idx + 1)
end_idx = answer.find('}', start_idx + 1)

answer_json = answer[start_idx:end_idx + 1]
print(answer_json)

In [None]:
import random
import json

In [None]:
# One twentieth of the chunk count.
# NOTE(review): presumably the basis for the 197-chunk subset processed below — confirm.
print(len(chunks) / 20)

In [None]:
# Instruction text placed in front of every chunk when asking for questions.
QUESTION_INSTRUCTIONS = (
    "Based on the following text, generate exactly 5 short and relevant questions regarding a fact in the text. "
    "The output should be a valid JSON object with 5 key-value pairs, where each key is 'question_X' (X being the number 1 to 5) "
    "and each value is the corresponding question. Return only the JSON object in the following format:\n"
    "{\n\"question_1\": \"<question 1>\",\n\"question_2\": \"<question 2>\",\n\"question_3\": \"<question 3>\",\n"
    "\"question_4\": \"<question 4>\",\n\"question_5\": \"<question 5>\"\n} Do not repeat or reference any part of the input text in your response, and only output the JSON. Stick to this format and strictly do not write or output anything else. Your response should just be a json object and no other text\n\n"
)

# Instruction text placed in front of the chunk and its questions when asking for answers.
ANSWER_INSTRUCTIONS = (
    "Based on the following text, generate exactly 5 relavant answers for the following questions regarding a fact in the text. "
    "The output should be a valid JSON object with 5 key-value pairs, where each key is 'answer_X' (X being the number 1 to 5) "
    "and each value is the corresponding answer. Return only the JSON object in the following format:\n"
    "{\n\"answer_1\": \"<answer 1>\",\n\"answer_2\": \"<answer 2>\",\n\"answer_3\": \"<answer 3>\",\n"
    "\"answer_4\": \"<answer 4>\",\n\"answer_5\": \"<answer 5>\"\n} Do not repeat or reference any part of the input text in your response, and only output the JSON. Stick to this format and strictly do not write or output anything else. Your response should just be a json object and no other text\n\n"
)


def _generate_json_object(prompt, max_length=4000):
    """Run the model on `prompt` and return the first `{...}` span of its completion.

    Uses the notebook-level `tokenizer`, `model` and `device`. Only the newly
    generated tokens are decoded, so braces inside the prompt (format template,
    source chunk, previously generated questions) can never be mistaken for the
    model's own JSON. Returns an empty string when no complete object is found.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    ).to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    output_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + completion; keep only the completion.
    completion = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    start_idx = completion.find('{')
    end_idx = completion.find('}', start_idx + 1)
    if start_idx == -1 or end_idx == -1:
        return ""
    return completion[start_idx:end_idx + 1]


seed = 0  # running counter used to build unique datapoint ids
datapoints = []
# Only the first 197 chunks are processed (about one twentieth of the corpus).
for chunk_index, chunk in enumerate(chunks[:197]):
    print(f"Chunk: {chunk_index}")

    # Ask for a JSON object holding five questions about this chunk.
    questions = _generate_json_object(QUESTION_INSTRUCTIONS + chunk + ".")
    try:
        questions_json = json.loads(questions)
    except json.JSONDecodeError:
        print(f"Skipping chunk {chunk_index} due to JSONDecodeError in question generation.")
        continue

    # Ask for a JSON object holding the five matching answers.
    answers = _generate_json_object(
        ANSWER_INSTRUCTIONS + chunk + "." + "\n\n" + questions + "."
    )
    try:
        answers_json = json.loads(answers)
    except json.JSONDecodeError:
        print(f"Skipping chunk {chunk_index} due to JSONDecodeError in answer generation.")
        continue

    # Distractors come from every chunk except the one the questions were written about.
    distractor_indices = [k for k in range(len(chunks)) if k != chunk_index]

    # Create one datapoint per question/answer pair. Pairs are matched by key
    # ("question_3" <-> "answer_3") rather than by position, so a missing or
    # reordered answer cannot silently attach to the wrong question.
    for question_key, question_text in questions_json.items():
        answer_key = question_key.replace("question", "answer", 1)
        if answer_key not in answers_json:
            continue

        # Sample without replacement so distractors are distinct positions and
        # never the source chunk itself; the source chunk stays last, as before.
        picked = random.sample(distractor_indices, k=min(4, len(distractor_indices)))
        context_chunks = [chunks[k] for k in picked]
        context_chunks.append(chunk)

        datapoint = {
            'id': 'seed_task_' + str(seed),
            'type': 'general',
            'question': question_text,
            'context': {
                'sentences': [
                    context_chunks
                ],
                'title': [
                    ['placeholder_title'] * len(context_chunks)
                ]
            },
            'answer': answers_json[answer_key],
            'cot_answer': None
        }
        datapoints.append(datapoint)
        seed = seed + 1

In [None]:
# Number of question/answer datapoints collected by the generation loop.
print(len(datapoints))

In [None]:
# Save the generated datapoints as a JSON array; the relative path resolves
# against the notebook's current working directory.
with open('nlp-197.json', 'w') as f:
    json.dump(datapoints, f)