In [1]:
%pip install torch transformers faiss-cpu numpy rouge-score nltk sacrebleu
%pip install sentence-transformers
%pip install langchain
%pip install -U langchain-community




In [2]:
# import libraries
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import time
import psutil
from functools import lru_cache
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [3]:
def clean_definitions(definitions):
    """Normalize raw definition strings.

    Every entry is stripped of surrounding whitespace, has each newline
    replaced by a single space, and has the HTML entities ``&lt;`` and
    ``&gt;`` turned back into ``<`` and ``>``.

    Args:
        definitions: Iterable of definition strings.

    Returns:
        A new list holding the cleaned strings in their original order.
    """
    def _tidy(text):
        # Flatten to one line first, then decode the two escaped brackets.
        flattened = text.strip().replace('\n', ' ')
        return flattened.replace('&lt;', '<').replace('&gt;', '>')

    return [_tidy(entry) for entry in definitions]


In [4]:
# Define a minimal document class in the format the FAISS vector store expects
class Document:
    """Lightweight text record carrying content, metadata and an identifier.

    Attributes:
        page_content: The text of the document.
        metadata: Extra fields for the document; an empty dict when no
            (or a falsy) value is supplied.
        id: The supplied ``doc_id``, or ``hash(page_content)`` when
            ``doc_id`` is ``None``.
    """

    def __init__(self, page_content, metadata=None, doc_id=None):
        if doc_id is None:
            # No explicit identifier: derive one from the content itself.
            doc_id = hash(page_content)
        self.page_content = page_content
        self.metadata = metadata if metadata else {}
        self.id = doc_id


# Load definition files
def load_definitions(file_path):
    """Read a UTF-8 text file and return its non-blank lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one whitespace-stripped string per line that still has
        content after stripping, in file order.
    """
    kept = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text:
                kept.append(text)
    return kept

In [5]:
# Generate embeddings for definitions
class Retriever:
    """Similarity search over a list of definition strings.

    Each definition is wrapped in a ``Document`` whose id is its position in
    the list (as a string), embedded with the ``all-MiniLM-L6-v2`` model and
    stored in a FAISS vector store.

    Attributes:
        db: The FAISS vector store built from the definitions.
        retriever: Retriever view of ``db`` configured to return
            ``num_retrieved_docs`` results per query.
        index: The underlying index object exposed by ``db``.
    """

    def __init__(self, definitions, num_retrieved_docs=5):
        """Embed ``definitions`` and build the vector store.

        Args:
            definitions: Sequence of definition strings to index.
            num_retrieved_docs: Number of documents returned per search.
        """
        all_documents = [Document(definition, doc_id=str(i)) for i, definition in enumerate(definitions)]
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})
        self.index = self.db.index

    def search(self, query):
        """Return the documents most relevant to ``query``.

        ``get_relevant_documents`` is deprecated in current LangChain
        releases in favour of ``invoke``, so ``invoke`` is used whenever the
        retriever provides it; older retrievers fall back to the legacy
        method.

        Args:
            query: The search text.

        Returns:
            Whatever the underlying retriever returns for the query
            (a list of documents for LangChain retrievers).
        """
        invoke = getattr(self.retriever, "invoke", None)
        if callable(invoke):
            return invoke(query)
        # Legacy retrievers without the Runnable interface.
        return self.retriever.get_relevant_documents(query)

In [6]:
# Checkpoint names passed to T5Assistant, which loads a tokenizer and model
# for every entry up front and keeps them all in memory.
# NOTE(review): the recorded run reports roughly 40-47 GB of process memory
# with all four loaded — confirm the runtime can hold them before running.
models = ['t5-small', 'google/flan-t5-small','google/flan-t5-large','google/flan-t5-xxl']

# setup the generation model
class T5Assistant:
    """Holds several T5-family checkpoints and queries them with one prompt.

    Attributes:
        models: Maps each model name to a dict with its ``"tokenizer"`` and
            ``"model"`` objects.
    """

    def __init__(self, model_names):
        """Load the tokenizer and model for every name in ``model_names``.

        Args:
            model_names: Iterable of checkpoint names to load.
        """
        self.models = {}
        for name in model_names:
            print(f"Loading model: {name}")
            tok = T5Tokenizer.from_pretrained(name)
            net = T5ForConditionalGeneration.from_pretrained(name)
            self.models[name] = {"tokenizer": tok, "model": net}

    def create_prompt(self, query, retrieved_info):
        """Build the text prompt sent to every model.

        Args:
            query: The user question or instruction.
            retrieved_info: Context text to include in the prompt.

        Returns:
            A four-line prompt: fixed task description, the instruction,
            the relevant information, and a trailing ``Output:`` cue.
        """
        parts = [
            "Explain the concept or answer the question in a detailed manner using simple words and examples.",
            f"Instruction: {query}",
            f"Relevant information: {retrieved_info}",
            "Output:",
        ]
        return "\n".join(parts)

    def generalreplyies(self, query, retrieved_info):
        """Generate one reply per loaded model for the same prompt.

        Args:
            query: The user question or instruction.
            retrieved_info: Context text to include in the prompt.

        Returns:
            Dict mapping each model name to its decoded reply text.
        """
        prompt = self.create_prompt(query, retrieved_info)
        replies = {}

        for name, bundle in self.models.items():
            tok, net = bundle["tokenizer"], bundle["model"]

            encoded = tok(prompt, return_tensors="pt", padding=True, truncation=True)
            # Beam search over 5 beams, capped at 100 tokens.
            generated = net.generate(encoded.input_ids, max_length=100, num_beams=5, early_stopping=True)
            replies[name] = tok.decode(generated[0], skip_special_tokens=True)

        return replies




In [7]:
def cleanreplys(reply):
    """Remove echoed prompt labels from a generated reply.

    Args:
        reply: Text produced by a model.

    Returns:
        ``reply`` with every ``"Instruction:"`` and
        ``"Relevant information:"`` occurrence removed and the surrounding
        whitespace stripped.
    """
    for label in ("Instruction:", "Relevant information:"):
        reply = reply.replace(label, "")
    return reply.strip()


In [8]:
def bleuresults(reference, candidate):
    """Score ``candidate`` against a single ``reference`` with sentence BLEU.

    Both texts are tokenized by whitespace splitting, and NLTK's
    ``SmoothingFunction().method4`` is used for smoothing.

    Args:
        reference: Reference text.
        candidate: Generated text to evaluate.

    Returns:
        The value returned by ``sentence_bleu`` for the pair.
    """
    reference_tokens = reference.split()
    candidate_tokens = candidate.split()
    return sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method4,
    )

def print_memory_usage():
    """Print the resident set size reported by ``psutil.Process()`` in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    print(f"Memory Usage: {rss_bytes / 1024 / 1024:.2f} MB")

@lru_cache(maxsize=10)
def cached_generate_reply(assistant, query, retrieved_info):
    """Memoized reply generation for an assistant object.

    Results for the 10 most recently used argument combinations are kept by
    ``lru_cache``; a repeated call with the same arguments returns the cached
    object without calling the assistant again.

    The assistant's ``generate_reply`` method is used when it exists;
    otherwise the call goes to ``generalreplyies``, which is the method
    ``T5Assistant`` actually defines (calling ``generate_reply`` on it
    raised ``AttributeError``).

    Args:
        assistant: Object exposing ``generate_reply`` or ``generalreplyies``.
            Must be hashable, as required by ``lru_cache``.
        query: The user question or instruction (hashable).
        retrieved_info: Context text passed to the assistant (hashable).

    Returns:
        Whatever the assistant method returns. The same object is handed out
        on cache hits, so callers should not mutate it.

    Raises:
        AttributeError: If the assistant provides neither method.
    """
    generate = getattr(assistant, "generate_reply", None)
    if generate is None:
        generate = assistant.generalreplyies
    return generate(query, retrieved_info)

In [9]:
def memories():
    """Return the resident set size reported by ``psutil.Process()`` in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / 1024 / 1024

if __name__ == "__main__":
    # Read the definition corpus and normalize it.
    definitions = clean_definitions(load_definitions('ctx_pd.txt'))

    # Build the retriever and load every generation model.
    retriever = Retriever(definitions, num_retrieved_docs=5)
    generatesmodels = T5Assistant(models)

    # One row per (question, model) pair is collected here.
    results_list = []

    # Sample questions to evaluate.
    generalquestions = [
        "What is the definition of Articulation?",
        "Which accreditation framework is mentioned under the definition of Doctoral Degree?",
        "Which organization is referenced in the definition of Assessment?",
        "What is the difference between a Bachelor Degree and a Bachelor Honours Degree with reference to qualification level in the AQF?",
        "If a student studies in a foreign educational institution and is not a citizen or permanent resident of Australia, what term would the dataset use to categorize them as per the definitions?",
        "If a student completes a Certificate III and intends to directly pursue a Bachelor Degree, which concept from this dataset would likely apply to their transition?",
        "Describe the relationship between Foundation Course and Pathway Course based on their respective definitions.",
        "What percentage of modules studied that received a pass grade is referred to in the dataset, and under what name is this metric captured?",
        "If a student qualifies under the Doctoral Degree (Research) category of the AQF, what learning outcome is significant in their qualification process?",
        "Based on the definitions provided, how would the process of Admission differ from the process of Application, and what criteria must a student meet to progress from one to the other?"
    ]

    for query in generalquestions:
        # The clock starts before retrieval and keeps running while rows are
        # recorded, so each row's time covers retrieval plus generation by
        # all models (generation finishes before the first row is written).
        startings = time.time()

        # Retrieve related definitions: the top three form the prompt
        # context, while all retrieved texts form the BLEU reference.
        retrieved_docs = retriever.search(query)
        referencess = [doc.page_content for doc in retrieved_docs]
        retrinfo = " ".join(referencess[:3])
        reference_text = " ".join(referencess)

        # Ask every loaded model for a reply to the same prompt.
        generated_replies = generatesmodels.generalreplyies(query, retrinfo)

        for model_name, reply in generated_replies.items():
            reply = cleanreplys(reply)
            if referencess:
                blues = bleuresults(reference_text, reply)
            else:
                blues = "N/A"
            memory_usage = memories()

            ending = time.time()
            timing = ending - startings

            row = {
                'Query': query,
                'Model': model_name,
                'Generated Reply': reply,
                'Reference Answer': reference_text,
                'BLEU Score': blues,
                'Memory Usage (MB)': memory_usage,
                'Time Taken (seconds)': timing
            }
            results_list.append(row)

        print_memory_usage()

    # Convert the collected rows to a DataFrame and persist them as CSV.
    resultss = pd.DataFrame(results_list)
    resultss.to_csv('t5results.csv', index=False)

    print("Results saved to t5results.csv")

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading model: t5-small


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Loading model: google/flan-t5-small
Loading model: google/flan-t5-large
Loading model: google/flan-t5-xxl


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

  return self.retriever.get_relevant_documents(query)


Memory Usage: 39929.26 MB
Memory Usage: 42772.29 MB
Memory Usage: 47012.56 MB
Memory Usage: 45050.73 MB
Memory Usage: 47088.84 MB
Memory Usage: 42302.36 MB
Memory Usage: 43146.47 MB
Memory Usage: 44833.51 MB
Memory Usage: 43251.58 MB
Memory Usage: 43728.80 MB
Results saved to t5results.csv
