In [17]:
!pip install torch transformers faiss-cpu numpy pandas langchain nltk psutil sentence-transformers
!pip install -U langchain-community





In [18]:
# import the necessary toolkits
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import json
import time
import psutil
from functools import lru_cache
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [19]:
# define the document
class Document:
    """Lightweight text record exposing page_content, metadata and id.

    Args:
        page_content: The text carried by this record.
        metadata: Optional mapping of extra fields; any falsy value
            (None, empty dict, ...) is replaced by a fresh empty dict.
        doc_id: Optional explicit identifier. When it is None the identifier
            falls back to ``hash(page_content)``.
    """

    def __init__(self, page_content, metadata=None, doc_id=None):
        # Only None triggers the fallback, so ids such as 0 or "" are kept.
        if doc_id is None:
            doc_id = hash(page_content)
        self.id = doc_id
        self.metadata = metadata if metadata else {}
        self.page_content = page_content

# data cleaning
def clean_definitions(definitions):
    """Return a new list with every definition string normalized.

    Each entry is stripped of surrounding whitespace, then newlines become
    single spaces and the HTML entities ``&lt;`` / ``&gt;`` become ``<`` / ``>``.

    Args:
        definitions: Iterable of definition strings.

    Returns:
        List of cleaned strings, in the same order as the input.
    """
    # Applied in this order, after the initial strip().
    substitutions = (('\n', ' '), ('&lt;', '<'), ('&gt;', '>'))

    def _tidy(text):
        text = text.strip()
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    return [_tidy(entry) for entry in definitions]

# process the document
def load_definitions(file_path):
    """Read a UTF-8 text file and return its non-blank lines, stripped.

    Args:
        file_path: Path of the text file to read.

    Returns:
        List of lines with surrounding whitespace removed; lines that are
        empty after stripping are dropped.
    """
    entries = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                entries.append(stripped)
    return entries

In [20]:
# define the retriever model
class Retriever:
    """Similarity search over a list of definition strings using a FAISS index.

    Args:
        definitions: Iterable of definition strings to index. Must contain at
            least one entry.
        num_retrieved_docs: Number of documents returned per query (passed to
            the retriever as ``k``).
        embedding_model_name: Name of the HuggingFace embedding model used to
            embed the definitions and the queries.

    Raises:
        ValueError: If ``definitions`` is empty.
    """

    def __init__(self, definitions, num_retrieved_docs=5, embedding_model_name="all-MiniLM-L6-v2"):
        # Materialize once so generators are accepted and emptiness can be checked
        # before any model is loaded.
        definitions = list(definitions)
        if not definitions:
            raise ValueError("Retriever needs at least one definition to index.")

        # Document ids are the stringified positions in the input list.
        all_documents = [Document(definition, doc_id=str(i)) for i, definition in enumerate(definitions)]
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})
        self.index = self.db.index

    def search(self, query):
        """Return the documents the underlying retriever finds for ``query``.

        Prefers the retriever's ``invoke`` method and falls back to the legacy
        ``get_relevant_documents`` when ``invoke`` is not available, so the
        class works across LangChain releases.
        """
        run_query = getattr(self.retriever, "invoke", None)
        if run_query is None:
            run_query = self.retriever.get_relevant_documents
        return run_query(query)

In [21]:
# T5 model
# Checkpoint names handed to T5Assistant; every entry is loaded up front,
# and the recorded run below reported roughly 35 GB of resident memory.
models = [
    't5-small',
    'google/flan-t5-small',
    'google/flan-t5-large',
    'google/flan-t5-xxl',
]

class T5Assistant:
    """Holds several T5 checkpoints and generates one reply per checkpoint.

    Args:
        model_names: Iterable of checkpoint names. Each one is loaded in the
            constructor and stored in ``self.models`` under its name as a dict
            with the keys ``"tokenizer"`` and ``"model"``.
    """

    def __init__(self, model_names):
        self.models = {}
        for checkpoint in model_names:
            print(f"Loading model: {checkpoint}")
            # Keyword arguments are evaluated left to right: tokenizer first, then model.
            self.models[checkpoint] = dict(
                tokenizer=T5Tokenizer.from_pretrained(checkpoint),
                model=T5ForConditionalGeneration.from_pretrained(checkpoint),
            )

    def create_prompt(self, query, retrieved_info):
        """Build a single-turn prompt from a query and the retrieved text."""
        header = "Explain the concept or answer the question in a detailed manner using simple words and examples.\n"
        instruction_line = f"Instruction: {query}\n"
        info_line = f"Relevant information: {retrieved_info}\n"
        return header + instruction_line + info_line + "Output:"

    def generalreplyies(self, prompt, retrieved_info):
        """Generate a reply to ``prompt`` with every loaded model.

        Args:
            prompt: Full prompt text fed to each tokenizer.
            retrieved_info: Accepted for interface compatibility; not read here.

        Returns:
            Dict mapping model name to the decoded first generated sequence.
        """
        replies = {}
        for name in self.models:
            entry = self.models[name]
            tok, net = entry["tokenizer"], entry["model"]

            encoded = tok(prompt, return_tensors="pt", padding=True, truncation=True)
            sequences = net.generate(encoded.input_ids, max_length=100, num_beams=5, early_stopping=True)
            # Only the first returned sequence is decoded.
            replies[name] = tok.decode(sequences[0], skip_special_tokens=True)

        return replies


In [22]:
# manage history conversation
def create_chatbot_prompt(query, chat_history, retrieved_info):
    """Build the multi-turn prompt sent to the models.

    Args:
        query: The current user message.
        chat_history: Sequence of ``(user_message, bot_reply)`` pairs; only
            the last five pairs are included.
        retrieved_info: Retrieved text inserted after the instruction.

    Returns:
        The prompt string, ending in ``"Output:"``.
    """
    transcript_lines = []
    for user_turn, bot_turn in chat_history[-5:]:
        transcript_lines.append(f"User: {user_turn}\nBot: {bot_turn}")
    history_str = "\n".join(transcript_lines)

    opening = "Carry on the conversation and provide detailed information.\n"
    context_part = f"Context:\n{history_str}\n"
    instruction_part = f"Instruction: {query}\n"
    info_part = f"Relevant information: {retrieved_info}\n"
    return opening + context_part + instruction_part + info_part + "Output:"

def replychat(assistant, query, chat_history, retriever):
    """Retrieve supporting text for ``query`` and ask the assistant to reply.

    Args:
        assistant: Object exposing ``generalreplyies(prompt, retrieved_info)``.
        query: The current user message.
        chat_history: Previous ``(user_message, bot_reply)`` pairs.
        retriever: Object exposing ``search(query)`` that returns documents
            with a ``page_content`` attribute.

    Returns:
        Whatever ``assistant.generalreplyies`` returns for the built prompt.
    """
    # Only the first three retrieved documents feed the prompt.
    top_docs = retriever.search(query)[:3]
    passages = []
    for doc in top_docs:
        passages.append(doc.page_content)
    retrinfo = " ".join(passages)

    prompt = create_chatbot_prompt(query, chat_history, retrinfo)
    return assistant.generalreplyies(prompt, retrinfo)


In [23]:
# clean the reply message
def cleanreplys(reply):
    """Remove echoed prompt labels from a reply and trim surrounding whitespace.

    Args:
        reply: Raw generated text.

    Returns:
        ``reply`` without the substrings ``"Instruction:"`` and
        ``"Relevant information:"``, stripped at both ends.
    """
    text = reply
    for label in ("Instruction:", "Relevant information:"):
        text = text.replace(label, "")
    return text.strip()

# record the memory usage
def printmemorys():
    """Print the current process's resident set size in megabytes."""
    rss_bytes = psutil.Process().memory_info().rss
    print(f"Memory Usage: {rss_bytes / 1024 / 1024:.2f} MB")


In [24]:
if __name__ == "__main__":
    # Load the definition corpus and normalize each entry.
    definitions = load_definitions('ctx_pd.txt')
    definitions = clean_definitions(definitions)

    # Build the retrieval index and load the generation models.
    retriever = Retriever(definitions, num_retrieved_docs=5)
    t5sassistances = T5Assistant(models)

    # Model whose answer is recorded in the chat history. If it is not among
    # the loaded models, the first available reply is used instead of failing.
    history_model = 'google/flan-t5-small'

    # Conversation history as (user_message, bot_reply) pairs.
    chat_history = []

    # Interactive loop: runs until the user types "exit" or "quit".
    while True:
        # Strip so that padded input such as " exit" is still recognized.
        user_input = input("What is the query: ").strip()
        if user_input.lower() in ["exit", "quit"]:
            break
        if not user_input:
            # Nothing to retrieve or generate for a blank query.
            continue

        # Generate one reply per loaded model.
        generalreply = replychat(t5sassistances, user_input, chat_history, retriever)

        # Show every model's cleaned reply.
        for model_name, reply in generalreply.items():
            cleaned_reply = cleanreplys(reply)
            print(f"\nModel: {model_name}\nBot: {cleaned_reply}")

        # Record this turn in the history (skipped when no model produced a reply).
        if generalreply:
            if history_model in generalreply:
                history_reply = generalreply[history_model]
            else:
                history_reply = next(iter(generalreply.values()))
            chat_history.append((user_input, history_reply))

        printmemorys()



Loading model: t5-small
Loading model: google/flan-t5-small
Loading model: google/flan-t5-large
Loading model: google/flan-t5-xxl


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

User: Which accreditation framework is mentioned under the definition of Doctoral Degree?

Model: t5-small
Bot: :  Which accreditation framework is mentioned under the definition of Doctoral Degree? Context:  Which accreditation framework is mentioned under the definition of Doctoral Degree?  "Doctoral Degree is Course with major research component: comprised of two-thirds or more research leading to a thesis/dissertation OR qualifies individuals who apply a substantial body of knowledge to research, investigate and develop new knowledge, in one or more fields of investigation

Model: google/flan-t5-small
Bot: Higher Education Standards Framework

Model: google/flan-t5-large
Bot: Australian Qualifications Framework

Model: google/flan-t5-xxl
Bot: Australian Qualifications Framework
Memory Usage: 35080.66 MB
User: exit
