In [None]:
!pip install accelerate
!pip install bitsandbytes

# LLM Question Answer Generation

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"

# Start from the checkpoint's own config and override the position-embedding limit.
# NOTE(review): 8096 is an unusual value — confirm that 8192 was not intended.
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
config.max_position_embeddings = 8096

# 4-bit NF4 quantization with double quantization and bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

# NOTE(review): with device_map="cuda" the CPU-offload flag above and the
# offload_folder below are presumably never exercised — confirm before relying on them.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    config=config,
    quantization_config=quantization_config,
    trust_remote_code=True,
    device_map="cuda",
    offload_folder="./offload",
)

In [None]:
# Sample document chunk; it is interpolated into the user prompt after "Content:".
information = """About Scotty

The Scottish terrier has long been a familiar figure around Carnegie Mellon's campus. For years students have suited up in an unofficial Scottish terrier costume to excite the fans at athletic events. But the relationship between the Scottish terrier breed and Carnegie Mellon far precedes anybody doing somersaults in a dog costume. Andrew Carnegie, founder of the university, kept a Scottish terrier as his pet.

Scotty's road from popular icon to official mascot of the university began in 2006. Carnegie Mellon formed a Mascot Identity Task Force in November 2006, which consisted of students, faculty, staff and alumni. The Task Force was co-chaired by Director of Athletics Susan Bassett and Dean of Student Affairs Jennifer Church.

The mascot selection process included a series of surveys and a university Town Hall meeting. Nearly 78 percent of 2,370 students surveyed in February 2007 voted for the Scottish terrier, and approximately 25 percent of 400 alumni surveyed thought the Scottish terrier was already the mascot.

In the spring, the Task Force partnered with SME Branding — a firm with more than 17 years of experience creating mascots for professional sports teams and universities — to develop the graphics for the mascot. During October, students and alumni reviewed potential mascot images in focus groups.

Carnegie Mellon's official mascot debuted at the Nov. 10, 2007 home football game. The graphic features a profile of a distinguished, bold Scottish terrier sporting a plaid scarf around his neck. The dog is contained in a shield, representing Carnegie Mellon's Scottish heritage.

The Task Force then partnered with a mascot costume company to design our Scottish terrier in the winter of 2007. The official Scotty costume was unveiled at the 2008 Spring Carnival."""


# Single-turn chat: the instruction and the document chunk share one user message.
messages = [
    {"role": "user", "content": "You are an expert AI assisting us in creating a high quality, diverse synthetic dataset to train Information Retrieval models. Your role is to analyse the document chunk given to you and provide us with high quality potential queries and answers. Make sure answer are concise. \n\n Content:{}".format(information)},
    ]

# Render the conversation with the tokenizer's chat template and tokenize it.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")

# Follow the model's actual placement instead of hard-coding "cuda", so the cell
# keeps working if the device map used at load time changes.
model_inputs = encodeds.to(model.device)

# Sampling is enabled, so the output differs between runs unless a seed is set.
# pad_token_id is passed explicitly so generate() does not have to fall back to
# the EOS token on its own (and warn about it).
generated_ids = model.generate(
    model_inputs,
    max_new_tokens=1000,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

# `decoded` holds the prompt followed by the generated continuation.
decoded = tokenizer.batch_decode(generated_ids)

# Show the result; previously it was computed but never displayed.
print(decoded[0])

# Template questions

In [2]:
# Question templates grouped by question type. Every "{}" is a positional slot
# meant to be filled with str.format(); templates without a slot are used as-is.

# Yes/no questions about authorship, venue and publication status.
factoid_questions = [
    "Is {} the author for {}?",
    "Is {} the publication venue for {}?",
    "Was {} published?",
    "Did {} help publish {}?",
]

descriptive_questions = ["Describe the contributions of the paper {}"]

reasoning_questions = ["If I had to find up and coming research on {}, which paper should I refer to?"]

opinion_questions = ["What is the best method for {}?"]

numeric_questions = [
    "How many authors did the paper {} have?",
    "Which author has the most publications for the year of 2023?",
]

# "LTI" (not "LIT") matches the institute name used in multi_turn_questions.
list_questions = [
    "What are the different types of language tasks for the papers published by LTI faculty?",
    "List three papers that are about hallucinations for LLMs?",
    "What are the publication venues for the papers published by LTI faculty?",
    "What are all the publication venues for papers written by author {}?",
]

contextual_questions = [
    "What are the main findings of {}?",
    "What are the key results of {}?",
]

multi_turn_questions = [
    "What is a method proposed for {} and what makes it effective?",
    "List {} unique publication venues for papers from LTI professors at CMU.",
]

comparison_questions = ["Compare {} versus {} for {}"]
# Backward-compatible alias: the original, misspelled name may be referenced elsewhere.
comparision_questions = comparison_questions

temporal_questions = ["When was the paper {} published?"]

In [13]:
# load metadata
import pickle as pkl

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
with open("webscholar_lti_dict.pkl", "rb") as input_file:
    data = pkl.load(input_file)

# Read every line of the metadata file. The context manager closes the handle
# even if reading raises, which the manual open()/close() pair did not guarantee.
with open("Web Scholar PDFs/metadata.txt") as f:
    l = f.readlines()

In [35]:
def createAnswer(question, answer, file, type):
    """Render one question/answer record as a tagged text block.

    The four arguments are written one per line, tagged ``Q`` (question),
    ``A`` (answer), ``D`` (file) and ``T`` (type) in that order, and the
    record is terminated by an empty line.
    """
    fields = (("Q", question), ("A", answer), ("D", file), ("T", type))
    return "".join(f"{tag}: {value}\n" for tag, value in fields) + "\n"

In [44]:
import random

# Shuffle the top-level keys of `data`; they seed the author index in that order.
k = list(data.keys())
random.shuffle(k)

# authors: name -> set of paper titles. Every top-level key starts with an empty set.
authors = {name: set() for name in k}

titles = set()      # every paper title seen
venue = set()       # every publicationVenue value seen
paper_venue = {}    # publicationVenue value -> set of paper titles

for owner, papers in data.items():
    for paper_id in papers:
        paper = papers[paper_id]
        title = paper["title"]

        # Credit the paper to the top-level key it is stored under.
        authors[owner].add(title)
        titles.add(title)

        pub_venue = paper["publicationVenue"]
        venue.add(pub_venue)
        paper_venue.setdefault(pub_venue, set()).add(title)

        # Credit the paper to every listed author, creating entries on first sight.
        for auth in paper["authors"]:
            authors.setdefault(auth["name"], set()).add(title)

In [41]:
# Build yes/no authorship questions ("Is <author> the author for <title>?")
# for a random handful of authors.
questions = []

# Sample at most 5 authors; min() keeps random.sample from raising ValueError
# when the index holds fewer than 5 names.
all_authors = list(authors.keys())
k = random.sample(all_authors, min(5, len(all_authors)))

for i in k:
    # Titles already asked about for this author. Every emitted question is
    # recorded here, including "No." ones, so no question is produced twice.
    seen = set()
    for j in range(3):
        randAuth = random.choice(all_authors)

        # One candidate from the author's own papers and one from a randomly
        # chosen author's papers; authors without papers contribute nothing.
        candidates = []
        if authors[i]:
            candidates.append(random.choice(list(authors[i])))
        if authors[randAuth]:
            candidates.append(random.choice(list(authors[randAuth])))

        for title in candidates:
            if title in seen:
                continue
            seen.add(title)

            # The answer is decided by membership, so a title drawn from the
            # random author that `i` also wrote is still answered "Yes.".
            answer = "Yes." if title in authors[i] else "No."
            questions.append(
                createAnswer(factoid_questions[0].format(i, title), answer, "metadata.txt", "1")
            )

['Q: Is Giorgio Verdiani the author for The earlier Mona Lisa: creating a tactile physical model for transversal sharing and learning during the exhibition?\nA: Yes.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Giorgio Verdiani the author for Model-Generated Pretraining Signals Improves Zero-Shot Generalization of Text-to-Text Transformers?\nA: No.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Giorgio Verdiani the author for InPars-Light: Cost-Effective Unsupervised Training of Efficient Rankers?\nA: No.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Giorgio Verdiani the author for Aligning Large Multimodal Models with Factually Augmented RLHF?\nA: No.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Tianhua Tao the author for SlimPajama-DC: Understanding Data Combinations for LLM Training?\nA: Yes.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Tianhua Tao the author for Decision Support System for Determining the Best PAUD Teacher Using the MOORA Method?\nA: No.\nD: metadata.txt\nT: 1\n\n',
 'Q: Is Tianhua Tao the author for The Fr