In [17]:
import csv
import pandas as pd
import datetime
import json
import string
from bs4 import BeautifulSoup
from argparse import ArgumentParser


In [None]:
%pip install accelerate

In [11]:
def remove_html(string):
    """Return the plain-text content of an HTML fragment.

    The markup is parsed with BeautifulSoup using the "lxml" parser and only
    the text of the parsed document is returned.

    NOTE: the parameter name shadows the stdlib ``string`` module inside this
    function only; it is kept so existing keyword callers keep working.
    """
    soup = BeautifulSoup(string, "lxml")
    return soup.text

def load_topic_file(topic_filepath):
    """Read the topic (query) file for this year of the lab.

    The result is meant to be passed on to the BERT/PyTerrier methods.

    Args:
        topic_filepath: path to a JSON file containing a list of topic
            objects; each one is read through its 'Id', 'Title', 'Body'
            and 'Tags' keys.

    Returns:
        dict mapping topic id -> [title, body, tags]. Title and body have
        ``string.punctuation`` characters removed; the body additionally has
        its HTML stripped first. Tags are passed through untouched.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(topic_filepath) as f:
        queries = json.load(f)

    # Build the punctuation-deleting table once instead of twice per topic.
    strip_punct = str.maketrans('', '', string.punctuation)

    result = {}
    for item in queries:
        title = item['Title'].translate(strip_punct)
        # HTML is removed before punctuation so tags are parsed intact.
        body = remove_html(item['Body']).translate(strip_punct)
        result[item['Id']] = [title, body, item['Tags']]
    return result

def read_qrel_file(qrel_filepath):
    """Read a tab-separated qrel (relevance judgement) file.

    Every non-empty row is read positionally: column 0 is the query id,
    column 2 the document id and column 3 the integer relevance score
    (column 1 is ignored). Blank lines are skipped.

    Args:
        qrel_filepath: path to the TSV file.

    Returns:
        dict mapping query_id -> {doc_id: score}. Ids are kept as the strings
        read from the file; if a (query, doc) pair repeats, the last row wins.

    Raises:
        IndexError: a non-empty row has fewer than four columns.
        ValueError: the score column cannot be parsed as an integer.
    """
    result = {}
    # newline="" lets the csv module do its own line-ending handling,
    # as its documentation recommends for files passed to csv.reader.
    with open(qrel_filepath, "r", newline="") as f:
        reader = csv.reader(f, delimiter='\t', lineterminator='\n')
        for line in reader:
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError, so skip it.
                continue
            query_id, doc_id, score = line[0], line[2], int(line[3])
            result.setdefault(query_id, {})[doc_id] = score
    return result

def read_collection(answer_filepath):
    """Read the answer collection into a dictionary.

    Args:
        answer_filepath: path to a JSON file containing a list of documents;
            each one is read through its 'Id' and 'Text' keys.

    Returns:
        dict mapping document id -> answer text with HTML stripped and
        ``string.punctuation`` characters removed.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open(answer_filepath) as f:
        lst = json.load(f)

    # Build the punctuation-deleting table once instead of once per document.
    strip_punct = str.maketrans('', '', string.punctuation)

    result = {}
    for doc in lst:
        # HTML is removed before punctuation so tags are parsed intact.
        result[doc['Id']] = remove_html(doc['Text']).translate(strip_punct)
    return result
def prep_queries(topics):
    """Combine each topic's title and body into a single query string.

    Args:
        topics: dict mapping query id -> sequence whose element 0 is the
            title and element 1 is the body.

    Returns:
        dict mapping the same query ids -> "[TITLE]<title>[BODY]<body>".
    """
    return {
        query_id: "[TITLE]" + fields[0] + "[BODY]" + fields[1]
        for query_id, fields in topics.items()
    }

from sentence_transformers import InputExample

def cross_dataset_gen(queries,qrel,answers):
    """Build sentence-pair training samples from the relevance judgements.

    For every (topic, document) pair in ``qrel`` one ``InputExample`` is
    created whose texts are ``[queries[topic], answers[doc]]``. Scores >= 1
    are collapsed to label 1; any other score is used as the label unchanged.

    Args:
        queries: mapping indexed by topic id, giving the query text.
        qrel: mapping topic id -> {doc id: score}.
        answers: mapping indexed by doc id, giving the answer text.

    Returns:
        list of InputExample, in the iteration order of ``qrel``.

    Raises:
        KeyError: a judged topic is missing from ``queries`` or a judged
            document is missing from ``answers``.
    """
    sample_list = []
    for topic, judged_docs in qrel.items():
        for doc, score in judged_docs.items():
            # Binarise graded relevance: anything >= 1 counts as relevant.
            label = 1 if score >= 1 else score
            sample_list.append(
                InputExample(texts=[queries[topic], answers[doc]], label=label)
            )
    return sample_list



In [12]:
# Disabled command-line entry point: it is wrapped in a bare string literal so
# it is never executed; the notebook loads the files directly in a later cell.
# NOTE(review): every option below combines required=True with a default, so
# the defaults could never take effect if this were re-enabled.
"""def main():
    parser = ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='search domain file e.g. Answers.json', default="Answers.json")
    parser.add_argument('-t', '--topic', required=True, help='topic source files e.g. topics_1.json', default="topics_1.json")
    parser.add_argument('-q', '--qrel', required=True, help='qrel source files e.g. qrel_1.tsv', default="qrel_1.tsv")
    args = parser.parse_args()
    answer_filepath = args.input
    topic_filepath = args.topic
    qrel_filepath = args.qrel

    topics = load_topic_file(topic_filepath)
    qrel = read_qrel_file(qrel_filepath)
    answers = read_collection(answer_filepath)

    queries = prep_queries(topics)
"""

'def main():\n    parser = ArgumentParser()\n    parser.add_argument(\'-i\', \'--input\', required=True, help=\'search domain file e.g. Answers.json\', default="Answers.json")\n    parser.add_argument(\'-t\', \'--topic\', required=True, help=\'topic source files e.g. topics_1.json\', default="topics_1.json")\n    parser.add_argument(\'-q\', \'--qrel\', required=True, help=\'qrel source files e.g. qrel_1.tsv\', default="qrel_1.tsv")\n    args = parser.parse_args()\n    answer_filepath = args.input\n    topic_filepath = args.topic\n    qrel_filepath = args.qrel\n\n    topics = load_topic_file(topic_filepath)\n    qrel = read_qrel_file(qrel_filepath)\n    answers = read_collection(answer_filepath)\n\n    queries = prep_queries(topics)\n'

In [13]:
# Load the lab inputs (paths are relative to the working directory) and build
# the combined "[TITLE]...[BODY]..." query strings that are later passed to
# gen_new_queries.
topics = load_topic_file("topics_1.json")
qrel = read_qrel_file("qrel_1.tsv")
answers = read_collection("Answers.json")
queries = prep_queries(topics)
#print(queries)

In [19]:
import transformers
import torch

# Checkpoint used to turn each query into a generated answer.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Text-generation pipeline with bfloat16 weights; device placement is left to
# device_map="auto".
pipe = transformers.pipeline(
    task="text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# System message placed in front of every user query sent to the LLM.
# The role must be the lowercase string "system": chat templates compare role
# names case-sensitively, so "System" is not recognised as the system message.
# NOTE(review): the triple-quoted text embeds a newline plus indentation inside
# the prompt; it is left untouched so the prompt wording stays as authored.
system_prompt = {"role": "system","content": """You are an expert at answering ubuntu related questions on stackoverflow the question 
                and answer website, given a question you craft an suitable answer in plain text with no markdown formatting."""}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
def gen_new_queries(queries, system_prompt,pipe): 
    """Rewrite every query into an LLM-generated answer.

    Each combined query is sent to the LLM, which writes an answer for it.
    This allows semantic searching answer -> answer rather than
    question -> answer.

    Args:
        queries: dict mapping query id -> query text. It is updated IN PLACE:
            each value is overwritten with the generated answer, so calling
            this twice on the same dict feeds the answers back in as queries.
        system_prompt: chat message dict placed before each user message.
        pipe: text-generation pipeline; it must be callable and expose a
            ``tokenizer`` with ``apply_chat_template``, ``eos_token_id`` and
            ``convert_tokens_to_ids``.

    Returns:
        The same ``queries`` dict object, now holding the generated answers.
    """
    tokenizer = pipe.tokenizer
    # The stop-token ids do not depend on the query, so compute them once.
    terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

    # Only values of existing keys are replaced, so iterating the dict while
    # assigning into it is safe.
    for query_id in queries:
        query_input = {"role": "user", "content": queries[query_id]}
        messages = [system_prompt, query_input]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = pipe(prompt, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6,
                       top_p=0.9, pad_token_id=tokenizer.eos_token_id)
        # The returned text starts with the prompt itself; keep only what follows.
        result = outputs[0]["generated_text"][len(prompt):].strip()
        queries[query_id] = result
        # f-string so ids that are not str (e.g. int) do not raise TypeError.
        print(f"processed qid {query_id}")
    return queries

In [None]:
# Keep the raw dict under its own name so the (slow) generation result is
# still available if building the table fails.
generated_answers = gen_new_queries(queries,system_prompt,pipe)

# The result maps query id -> answer string. Passing such a dict of scalars
# straight to pd.DataFrame raises "If using all scalar values, you must pass
# an index", so build the frame from (id, answer) rows instead.
query_answers = pd.DataFrame(
    list(generated_answers.items()),
    columns=["query_id", "generated_answer"],
)


processed qid 3
processed qid 196614
processed qid 6
processed qid 7
processed qid 9
processed qid 262154
processed qid 11
processed qid 65549
processed qid 98319
processed qid 360466
processed qid 360478
processed qid 32800
processed qid 34
processed qid 36
processed qid 37
processed qid 1245227
processed qid 32812
processed qid 163903
processed qid 66
processed qid 67
processed qid 69
processed qid 393289
processed qid 131149
processed qid 1409105
processed qid 98393
processed qid 131168
processed qid 108
processed qid 491629
processed qid 65645
processed qid 111
processed qid 98416
processed qid 163962
processed qid 327807
processed qid 127
processed qid 131
processed qid 134
processed qid 655496
processed qid 142
processed qid 688270
processed qid 295059
processed qid 295060
processed qid 131233
processed qid 164
processed qid 1409192
processed qid 182
processed qid 98489
processed qid 917695
processed qid 193
processed qid 209
processed qid 688338
processed qid 458967
processed qi

In [None]:
# Disabled single-query experiment: the same prompt/generate/slice steps that
# gen_new_queries runs per query, applied to one hard-coded example. It is
# wrapped in a bare string literal so it is never executed.
"""usr_prompt = {"role": "user","content": "[TITLE]How can I set the Software Center to install software for nonroot users[BODY]How can I set the Software Center to allow nonroot users to install stuff from the Ubuntu repos without having to type in their passwordIm fully aware of the security implications and I am willing to take the risk Fedora 12 shipped with something like this By modifying the PolicyKit configuration I believe"}
messages = [system_prompt,usr_prompt]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
terminators = [pipe.tokenizer.eos_token_id, pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
outputs = pipe(prompt, max_new_tokens=256, eos_token_id=terminators, do_sample=True, temperature=0.6,
top_p=0.9, pad_token_id = pipe.tokenizer.eos_token_id)
result= outputs[0]["generated_text"][len(prompt):].strip()
print(result)
"""

In [None]:
import transformers
import torch

# Stand-alone smoke test: load the model on one explicit device and generate
# from a plain (non-chat) prompt.
# NOTE(review): this uses the same model_id as the earlier `pipe` cell, so
# running both cells loads the checkpoint twice under two names
# (`pipe` and `pipeline`).

# Device index 0 when CUDA is available, otherwise -1
device = 0 if torch.cuda.is_available() else -1

# Define the model ID and load the pipeline with the specified device
model_id = "meta-llama/Llama-3.2-3B-Instruct"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device=device  # Use GPU if available, otherwise fallback to CPU
)

# Define the prompt (this overwrites any earlier module-level `prompt`)
prompt = "Tell me a fun fact about New York."

# Generate a response using the pipeline
response = pipeline(prompt, max_length=100)

# Print the generated text of the first returned candidate
print(response[0]['generated_text'])

In [22]:
import torch

# Report whether CUDA is usable and, if so, which device is current.
# get_device_name() raises when no CUDA device is available, so it is only
# called after the availability check succeeds; on a GPU machine the printed
# output is the same two lines as before.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name())

True
NVIDIA GeForce RTX 4090
