# Example Project: An LLM-Based Chatbot

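This notebook walks through a small chatbot pipeline: installing the dependencies, extracting text from PDF files, fine-tuning a foundational model, answering questions with retrieval-augmented generation (RAG), and serving the chatbot through a FastAPI endpoint.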

## 1. Requirements

The required dependencies are installed into the workspace as follows.

In [None]:
!pip install transformers faiss-cpu datasets fastapi uvicorn

## 2. Extract PDF Information

In [None]:
import fitz

def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated without a separator; an empty
        string when no page yields any text.
    """
    # The context manager closes the document (and its file handle) even when
    # reading a page raises.
    with fitz.open(pdf_path) as doc:
        # join builds the result in one pass instead of re-copying the
        # accumulated string for every page.
        return "".join(page.get_text() for page in doc)

# Example: extract a sample PDF and print its text to check the result.
path = "docs/info.pdf"
text = extract_text_from_pdf(pdf_path=path)
print(text)

## 3. Generate a Model Based on a Foundational Model and Fine-Tuning

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Extra classes needed for masked-language-model fine-tuning; imported here so
# the cell stays self-contained.
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling

# load foundational model
# The PDF text carries no labels, so the model is adapted with the
# masked-language-modeling objective. A sequence-classification head needs a
# "labels" column, and Trainer cannot compute a loss without one.
model_name = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# prepare specific data
path_concepts = "data/concepts.pdf"
text_concepts = extract_text_from_pdf(path_concepts)
dataset = Dataset.from_dict({"text": [text_concepts]})


# tokenize data
def tokenize_function(concepts):
    """Tokenize a batch of texts into chunks that fit the model.

    Args:
        concepts: Batch passed by ``Dataset.map(batched=True)``;
            ``concepts["text"]`` is a list of strings.

    Returns:
        A dict with ``input_ids`` and ``attention_mask``, one row per chunk.
        A long text yields several rows instead of being cut to one window.
    """
    encoded = tokenizer(
        concepts["text"],
        truncation=True,
        # Emit the overflow as additional rows rather than discarding it.
        return_overflowing_tokens=True,
    )
    # Keep only the model inputs; padding is applied per batch by the collator.
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
    }


# The number of chunks differs from the number of input rows, so the original
# "text" column has to be dropped for the mapped columns to line up.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

# train model with fine-tuning
# NOTE(review): warmup_steps=500 is likely larger than the total number of
# optimizer steps for a single small PDF, which keeps the learning rate near
# zero for the whole run - confirm against the real corpus size.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # Randomly masks tokens and derives the labels from them, so the
    # unlabeled text can be trained on; also pads each batch.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer),
)

trainer.train()

## 4. Retrieval-Augmented Generation (RAG)

In [None]:
import faiss
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Imported here so the cell stays self-contained.
import torch
from transformers import AutoModel, AutoTokenizer

# load retriever
# Vectors come from DistilBERT, whose hidden size (768) matches the index
# dimension below. Dedicated names are used because `model` and `tokenizer`
# are rebound by other cells.
# NOTE(review): the base checkpoint is used here; confirm whether the
# fine-tuned model was meant to produce the embeddings instead.
embedding_model_name = "distilbert-base-uncased"
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)


def embed(text: str) -> np.ndarray:
    """Return a 768-dimensional float32 embedding of a text.

    The embedding is the mean of DistilBERT's last hidden states over the
    tokens of ``text``; input beyond the model's maximum length is truncated.
    """
    inputs = embedding_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden_states = embedding_model(**inputs).last_hidden_state
    return hidden_states[0].mean(dim=0).numpy().astype("float32")


def _split_into_passages(text: str, words_per_passage: int = 50) -> list[str]:
    """Split a text into consecutive passages of at most ``words_per_passage`` words."""
    words = text.split()
    return [
        " ".join(words[start:start + words_per_passage])
        for start in range(0, len(words), words_per_passage)
    ]


# Index the content of the PDFs, not their file names. Passages are kept
# short so a retrieved passage plus the question fits the generator's input.
document_paths = ["data/updates.pdf"]
documents = []
for document_path in document_paths:
    documents.extend(_split_into_passages(extract_text_from_pdf(document_path)))
if not documents:
    raise ValueError("No text could be extracted from the documents to index")

index = faiss.IndexFlatL2(768)
# FAISS expects a 2-D float32 array with one row per document.
embeddings = np.array([embed(doc) for doc in documents], dtype="float32")
index.add(embeddings)

def retrieve(query: str, k: int = 1) -> list[str]:
    """Return the indexed documents closest to a query.

    Args:
        query: Text to search for.
        k: Maximum number of documents to return.

    Returns:
        Up to ``k`` entries of ``documents``, nearest first. Fewer entries
        (possibly none) are returned when the index holds fewer than ``k``
        vectors.
    """
    # FAISS expects a 2-D float32 array with one row per query.
    query_embedding = np.asarray([embed(query)], dtype="float32")
    _, neighbor_ids = index.search(query_embedding, k)
    # Missing neighbors are reported as -1, which would otherwise silently
    # select the last document.
    return [documents[i] for i in neighbor_ids[0] if i >= 0]


In [None]:
# load generator
# GPT-2 produces the final answer; this rebinds `model_name`, `tokenizer`
# and `model` for the rest of the notebook.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

def generate_response(prompt: str) -> str:
    """Generate an answer to a prompt, grounded on the retrieved documents.

    The retrieved documents are joined into a context that is placed before
    the prompt, and the generator continues that text.

    Args:
        prompt: The user's question.

    Returns:
        Only the newly generated text; the context and the prompt are not
        echoed back.
    """
    max_new_tokens = 150

    retrieved_documents = retrieve(prompt)
    context = " ".join(retrieved_documents)
    input_text = f"{context} {prompt}"
    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)

    # Keep the input within the model's context window while leaving room for
    # the answer. The tail is kept because the prompt sits at the end.
    max_input_tokens = model.config.n_positions - max_new_tokens
    input_ids = inputs["input_ids"][:, -max_input_tokens:]
    attention_mask = inputs["attention_mask"][:, -max_input_tokens:]

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        # max_length would count the input tokens too, leaving little or no
        # budget for the answer once a context is prepended.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # GPT-2 has no padding token; reuse the end-of-sequence token.
        pad_token_id=tokenizer.eos_token_id,
    )
    # The output starts with the input tokens; decode only what follows them.
    generated_tokens = output[0][input_ids.shape[1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)

# example: ask the chatbot a sample question and print its answer
prompt = "What is the latest update?"
response = generate_response(prompt=prompt)
print(response)

## 5. Deploy with FastAPI

In [None]:
from fastapi import FastAPI
from pydantic import BaseModel

# Application object on which the chatbot route is registered.
app = FastAPI()

# Request body of the /chatbot endpoint.
class Query(BaseModel):
    # Text of the user's message.
    prompt: str

@app.post("/chatbot")
def chatbot(query: Query):
    """Answer a chatbot prompt.

    Args:
        query: Request body carrying the user's prompt.

    Returns:
        ``{"response": <text>}`` on success. On failure, a response with
        HTTP status 500 and the body ``{"error": <message>}``.
    """
    import logging

    from fastapi.responses import JSONResponse

    try:
        response = generate_response(query.prompt)
    except Exception as e:
        # Top-level boundary of the request: record the traceback and report
        # the failure with an error status instead of a 200 response.
        logging.getLogger(__name__).exception("Failed to generate a chatbot response")
        return JSONResponse(status_code=500, content={"error": str(e)})
    return {"response": response}

    
# Serve the API on port 8000 of every network interface when run as a script.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app=app, host="0.0.0.0", port=8000)
