In [17]:
import csv
import pandas as pd
import datetime
import json
import string
from bs4 import BeautifulSoup
from argparse import ArgumentParser


In [None]:
%pip install accelerate

In [11]:
def remove_html(string):
    cleantext = BeautifulSoup(string, "lxml").text
    return cleantext

def load_topic_file(topic_filepath):
    # a method used to read the topic file for this year of the lab; to be passed to BERT/PyTerrier methods
    queries = json.load(open(topic_filepath))
    result = {}
    for item in queries:
      # returing results as dictionary of topic id: [title, body, tag]
      title =  item['Title'].translate(str.maketrans('', '', string.punctuation))
      #removing html from body
      body = remove_html(item['Body']).translate(str.maketrans('', '', string.punctuation))
      tags = item['Tags']
      result[item['Id']] = [title, body, tags]
    return result

def read_qrel_file(qrel_filepath):
    # a method used to read the topic file
    result = {}
    with open(qrel_filepath, "r") as f:
        reader = csv.reader(f, delimiter='\t', lineterminator='\n')
        for line in reader:
            query_id = line[0]
            doc_id = line[2]
            score = int(line[3])
            if query_id in result:
                result[query_id][doc_id] = score
            else:
                result[query_id] = {doc_id: score}
    # dictionary of key:query_id value: dictionary of key:doc id value: score
    return result

def read_collection(answer_filepath):
  # Reading collection to a dictionary
  lst = json.load(open(answer_filepath))
  result = {}
  for doc in lst:
    #processes the answers to remove html and punctuation.
    result[doc['Id']] = remove_html(doc['Text']).translate(str.maketrans('', '', string.punctuation))
  return result
#modifies each query to contain the title and body
def prep_queries(topics):
    queries = {}
    for query_id in topics:
        queries[query_id] = "[TITLE]" + topics[query_id][0] + "[BODY]" + topics[query_id][1]
    return queries

from sentence_transformers import InputExample

def cross_dataset_gen(queries,qrel,answers):
    result_list = []
    sample_list =[]
    for topic in qrel:
        print(f"Key: {topic}, Value: {qrel[topic]}")
        for doc, score in qrel[topic].items():
            pair = (queries[topic],answers[doc],score)
            result_list.append(pair)

    for pair in result_list:
        ex_1 = pair[0]
        ex_2 = pair[1]
        label = pair[2]
        if label >=1:
            label = 1
        sample_list.append(InputExample(texts=[pair[0],pair[1]],label=label))

    return sample_list



In [12]:
"""def main():
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='search domain file e.g. Answers.json', default="Answers.json")
    parser.add_argument('-t', '--topic', required=True, help='topic source files e.g. topics_1.json', default="topics_1.json")
    parser.add_argument('-q', '--qrel', required=True, help='qrel source files e.g. qrel_1.tsv', default="qrel_1.tsv")
    args = parser.parse_args()
    answer_filepath = args.input
    topic_filepath = args.topic
    qrel_filepath = args.qrel

    topics = load_topic_file(topic_filepath)
    qrel = read_qrel_file(qrel_filepath)
    answers = read_collection(answer_filepath)

    queries = prep_queries(topics)
"""

'def main():\n    parser = ArgumentParser()\n    parser.add_argument(\'-i\', \'--input\', required=True, help=\'search domain file e.g. Answers.json\', default="Answers.json")\n    parser.add_argument(\'-t\', \'--topic\', required=True, help=\'topic source files e.g. topics_1.json\', default="topics_1.json")\n    parser.add_argument(\'-q\', \'--qrel\', required=True, help=\'qrel source files e.g. qrel_1.tsv\', default="qrel_1.tsv")\n    args = parser.parse_args()\n    answer_filepath = args.input\n    topic_filepath = args.topic\n    qrel_filepath = args.qrel\n\n    topics = load_topic_file(topic_filepath)\n    qrel = read_qrel_file(qrel_filepath)\n    answers = read_collection(answer_filepath)\n\n    queries = prep_queries(topics)\n'

In [13]:
topics = load_topic_file("topics_1.json")
qrel = read_qrel_file("qrel_1.tsv")
answers = read_collection("Answers.json")
queries = prep_queries(topics)
#print(queries)

In [19]:
import transformers
import torch

model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipe = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
)

system_prompt = {"role": "System","content": """You are an expert at answering ubuntu related questions on stackoverflow the question 
                and answer website, given a question you craft an suitable answer in plain text with no markdown formatting."""}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
def gen_new_queries(queries, system_prompt,pipe): 
    #quids = queries.keys()
    #list_queries = list(queries.values())

    #this will take in each query, pass its combined query to the LLM which will rewrite the query into an answer
    #We do this so as to allow semantic searching answer -> answer rather than question -> answer.
    #we then export to TSV

    for query_id in queries:
        query_input = {"role": "user", "content": queries[query_id]}
        messages = [system_prompt,query_input]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        terminators = [pipe.tokenizer.eos_token_id, pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
        outputs = pipe(prompt, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6,
        top_p=0.9, pad_token_id = pipe.tokenizer.eos_token_id)
        result= outputs[0]["generated_text"][len(prompt):].strip()
        queries[query_id] = result
        print("processed qid "+query_id)
    return queries

In [None]:
query_answers = gen_new_queries(queries,system_prompt,pipe)


In [22]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.get_device_name())

True
NVIDIA GeForce RTX 4090


In [26]:
tmp_q = {k: [v] for k, v in query_answers.items()}

In [36]:
qd = pd.DataFrame.from_dict(tmp_q,orient='index', columns=['Content'])
qd.reset_index(inplace=True)
qd.rename(columns={'index': 'DocID'}, inplace=True)
qd.to_csv('queries.tsv', sep='\t', index=False)

In [58]:
ad = pd.DataFrame.from_dict(answers,orient='index', columns=['Content'])
ad.reset_index(inplace=True)
ad.rename(columns={'index': 'DocID'}, inplace=True)

In [None]:
#using ChromaDB as a vecotr database for searching, it uses sbert as a text tokenizer.
%pip install chromadb 

In [40]:
import os
cwd = os.getcwd()
print(cwd)

/home/caleb/school/information-retrieval/project_3


In [42]:
import chromadb
db_path = cwd+"/chroma"
client = chromadb.PersistentClient(path=db_path)


In [None]:
#we select the embedding function to use with the DB
#we can also use local models
from chromadb.utils import embedding_functions
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")

In [85]:
collection = client.create_collection(name="answers")


In [86]:
def batch(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]

In [87]:
#Here we build corpus that we will search over!
#Fortunatly, Chroma allows metadata like DocID to be stored with embeddings, this makes analysing the quality of the retreival 

doc_ids = ad['DocID'].tolist()
documents = ad['Content'].tolist()

max_batch_size = 1000
doc_batches = list(batch(documents, max_batch_size))
id_batches = list(batch(doc_ids, max_batch_size))
i=0
for doc_batch, id_batch in zip(doc_batches, id_batches):
    collection.add(
        documents=doc_batch,
        ids=id_batch
    )
    i+=1
    print("batch: "+str(i))    

batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
batch: 46
batch: 47
batch: 48
batch: 49
batch: 50
batch: 51
batch: 52
batch: 53
batch: 54
batch: 55
batch: 56
batch: 57
batch: 58
batch: 59
batch: 60
batch: 61
batch: 62
batch: 63
batch: 64
batch: 65
batch: 66
batch: 67
batch: 68
batch: 69
batch: 70
batch: 71
batch: 72
batch: 73
batch: 74
batch: 75
batch: 76
batch: 77
batch: 78
batch: 79
batch: 80
batch: 81
batch: 82
batch: 83
batch: 84
batch: 85
batch: 86
batch: 87
batch: 88
batch: 89
batch: 90
batch: 91
batch: 92
batch: 93
batch: 94
batch: 95
batch: 96
batch: 97
batch: 98
batch: 99
batch: 100
batch: 1

In [84]:
#deletes the topics1 collection
client.delete_collection("answers")

In [None]:
collection.peek()

In [89]:
collection.query(query_texts="hello world")

{'ids': [['692870',
   '762348',
   '1433100',
   '450861',
   '692874',
   '1372106',
   '387748',
   '1015152',
   '1097243',
   '1298924']],
 'embeddings': None,
 'documents': [[' Is there any easy way to install helloworld on to my computerThe next actions you need to domake the file executable with chmod 775 helloworld from the directory where the file iscopy it over to a directory in your PATH I would suggest sudo cp helloworld usrlocalbin local since it is your local system and bin since it is a binairy And then you can dohelloworldfrom any location on the system to have it print Hello world',
   'You should run the command as helloworldshowdev',
   'awk printprint  helloworldtxt',
   'echo Hello World orprintf Hello World or not for newbiesstrHello World  str  grep o str',
   'sudo install helloworld usrlocalbin installs it to usrlocalbin read man installBtw your helloworldc should really beinclude int mainvoid    printfHello World    return 0',
   'Try thisif  5 gt 1 thenecho 