# LLM
vamos a crear un llm usando python

## Requerimientos
instalamos los siguientes requerimientos: (!pip para que lo instale en este cuaderno):

In [1]:
%pip install transformers[torch] datasets fastapi faiss-cpu uvicorn pymupdf torch torchvision torchaudio numpy accelerate

Note: you may need to restart the kernel to use updated packages.


## Extraer la informacion del pdf

In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    # Abre el archivo PDF
    document = fitz.open(pdf_path)
    
    # Extrae texto de cada página
    text = ""
    for page_num in range(document.page_count):
        page = document.load_page(page_num)
        text += page.get_text("text")
    
    # Cierra el documento
    document.close()
    
    return text

## Generamos un modelo fundacional usando finetunning

In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Load foundational model
model_name = "google-bert/bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# prepare specific data
text_concepts = extract_text_from_pdf("docs/concepts.pdf")
dataset = Dataset.from_dict({"text": [text_concepts], "labels": [0]})  # Add a dummy label

# tokenize the data
def tokenize_function(concepts):
    return tokenizer(concepts["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# train model with fine-tuning

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=56,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:00<00:00,  5.29 examples/s]


Step,Training Loss


TrainOutput(global_step=56, training_loss=0.473700898034232, metrics={'train_runtime': 242.73, 'train_samples_per_second': 0.231, 'train_steps_per_second': 0.231, 'total_flos': 14734219100160.0, 'train_loss': 0.473700898034232, 'epoch': 56.0})

## RAG
fine-tunning le da un conocimiento base que no se modifica. RAG es darle los conceptos modernos o actualizado

In [None]:
import faiss
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoModel, AutoTokenizer


# load retriever
index = faiss.IndexFlatIP(768)
documents = ["docs/upgrades.pdf"]
embeddings = np.array([embed(doc) for doc in documents])
index.add(embeddings)

# load generator
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# funcion para obtener info
def retrieve(query: str) -> str:
    # embed query
    query_embedding = embed(query)
    
    # retrieve
    D, I = index.search(np.array([query_embedding]), 1)
    
    return [documents[i] for i in I[0]]

AssertionError: 

In [None]:
def generate_response(promt: str) -> str:
    retrieved_document = retrieve(promt)
    context = " ".join(retrieved_document)
    input_text = f"{context} {promt}"
    inputs = tokenizer.encode(input_text, return_tensors="pt", add_special_tokens=False)
    output = model.generate(inputs, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# example
promt = "What is the difference between a concept and a feature?"
response = generate_response(promt)
print(response)

In [None]:
from fastapi import Query, app


@app.post("/chatbot")
def chatbot(query: Query):
    try:
        response = generate_response(query.promt)
        return {"response": response}
    except Exception as e:
        return {"response": str(e)}
    
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)